File size: 3,053 Bytes
81414ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
{
  "builder_name": "hupd",
  "citation": "@InProceedings{suzgun2021:hupd,\ntitle = {The Harvard USPTO Patent Dataset},\nauthor={Mirac Suzgun and Suproteem Sarkar and Luke Melas-Kyriazi and Scott Kominers and Stuart Shieber},\nyear={2021}\n}\n",
  "config_name": "sample",
  "dataset_size": 1848322042,
  "description": "\nThe Harvard USPTO Patent Dataset (HUPD) is a large-scale, well-structured, and multi-purpose corpus \nof English-language patent applications filed to the United States Patent and Trademark Office (USPTO) \nbetween 2004 and 2018. With more than 4.5 million patent documents, HUPD is two to three times larger \nthan comparable corpora. Unlike other NLP patent datasets, HUPD contains the inventor-submitted versions \nof patent applications, not the final versions of granted patents, allowing us to study patentability at \nthe time of filing using NLP methods for the first time.\n",
  "download_checksums": {
    "https://huggingface.co/datasets/HUPD/hupd/resolve/main/hupd_metadata_jan16_2022-02-22.feather": {
      "num_bytes": 6665746,
      "checksum": null
    },
    "https://huggingface.co/datasets/HUPD/hupd/resolve/main/data/sample-jan-2016.tar.gz": {
      "num_bytes": 387636489,
      "checksum": null
    }
  },
  "download_size": 394302235,
  "features": {
    "patent_number": {
      "dtype": "string",
      "_type": "Value"
    },
    "decision": {
      "dtype": "int64",
      "_type": "Value"
    },
    "title": {
      "dtype": "string",
      "_type": "Value"
    },
    "abstract": {
      "dtype": "string",
      "_type": "Value"
    },
    "claims": {
      "dtype": "string",
      "_type": "Value"
    },
    "background": {
      "dtype": "string",
      "_type": "Value"
    },
    "summary": {
      "dtype": "string",
      "_type": "Value"
    },
    "description": {
      "dtype": "string",
      "_type": "Value"
    },
    "cpc_label": {
      "dtype": "string",
      "_type": "Value"
    },
    "ipc_label": {
      "dtype": "string",
      "_type": "Value"
    },
    "filing_date": {
      "dtype": "string",
      "_type": "Value"
    },
    "patent_issue_date": {
      "dtype": "string",
      "_type": "Value"
    },
    "date_published": {
      "dtype": "string",
      "_type": "Value"
    },
    "examiner_id": {
      "dtype": "string",
      "_type": "Value"
    }
  },
  "homepage": "https://github.com/suzgunmirac/hupd",
  "license": "",
  "size_in_bytes": 2242624277,
  "splits": {
    "train": {
      "name": "train",
      "num_bytes": 1184126558,
      "num_examples": 16153,
      "shard_lengths": [
        7000,
        7000,
        2153
      ],
      "dataset_name": "hupd"
    },
    "validation": {
      "name": "validation",
      "num_bytes": 664195484,
      "num_examples": 9094,
      "shard_lengths": [
        7000,
        2094
      ],
      "dataset_name": "hupd"
    }
  },
  "supervised_keys": {
    "input": "claims",
    "output": "decision"
  },
  "version": {
    "version_str": "0.0.0",
    "major": 0,
    "minor": 0,
    "patch": 0
  }
}