{ "builder_name": "hupd", "citation": "@InProceedings{suzgun2021:hupd,\ntitle = {The Harvard USPTO Patent Dataset},\nauthor={Mirac Suzgun and Suproteem Sarkar and Luke Melas-Kyriazi and Scott Kominers and Stuart Shieber},\nyear={2021}\n}\n", "config_name": "sample", "dataset_size": 1848322042, "description": "\nThe Harvard USPTO Patent Dataset (HUPD) is a large-scale, well-structured, and multi-purpose corpus \nof English-language patent applications filed to the United States Patent and Trademark Office (USPTO) \nbetween 2004 and 2018. With more than 4.5 million patent documents, HUPD is two to three times larger \nthan comparable corpora. Unlike other NLP patent datasets, HUPD contains the inventor-submitted versions \nof patent applications, not the final versions of granted patents, allowing us to study patentability at \nthe time of filing using NLP methods for the first time.\n", "download_checksums": { "https://huggingface.co/datasets/HUPD/hupd/resolve/main/hupd_metadata_jan16_2022-02-22.feather": { "num_bytes": 6665746, "checksum": null }, "https://huggingface.co/datasets/HUPD/hupd/resolve/main/data/sample-jan-2016.tar.gz": { "num_bytes": 387636489, "checksum": null } }, "download_size": 394302235, "features": { "patent_number": { "dtype": "string", "_type": "Value" }, "decision": { "dtype": "int64", "_type": "Value" }, "title": { "dtype": "string", "_type": "Value" }, "abstract": { "dtype": "string", "_type": "Value" }, "claims": { "dtype": "string", "_type": "Value" }, "background": { "dtype": "string", "_type": "Value" }, "summary": { "dtype": "string", "_type": "Value" }, "description": { "dtype": "string", "_type": "Value" }, "cpc_label": { "dtype": "string", "_type": "Value" }, "ipc_label": { "dtype": "string", "_type": "Value" }, "filing_date": { "dtype": "string", "_type": "Value" }, "patent_issue_date": { "dtype": "string", "_type": "Value" }, "date_published": { "dtype": "string", "_type": "Value" }, "examiner_id": { "dtype": "string", "_type": 
"Value" } }, "homepage": "https://github.com/suzgunmirac/hupd", "license": "", "size_in_bytes": 2242624277, "splits": { "train": { "name": "train", "num_bytes": 1184126558, "num_examples": 16153, "shard_lengths": [ 7000, 7000, 2153 ], "dataset_name": "hupd" }, "validation": { "name": "validation", "num_bytes": 664195484, "num_examples": 9094, "shard_lengths": [ 7000, 2094 ], "dataset_name": "hupd" } }, "supervised_keys": { "input": "claims", "output": "decision" }, "version": { "version_str": "0.0.0", "major": 0, "minor": 0, "patch": 0 } }