monsoon-nlp committed on
Commit
e58d432
1 Parent(s): 65f1be0

restore small model

Browse files
README.md ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GPT-NYC-small
2
+
3
+ ## About
4
+
5
+ GPT2 (small version on HF) fine-tuned on questions and responses from https://reddit.com/r/asknyc
6
+
7
+ I filtered comments to those with scores >= 3 that respond directly
8
+ to the original post (i.e., ignoring responses to other commenters).
9
+ I also added many tokens which were common on /r/AskNYC but missing from
10
+ GPT2.
11
+
12
+ The [gpt-nyc](https://huggingface.co/monsoon-nlp/gpt-nyc) repo is based
13
+ on GPT2-Medium and comes across as more accurate, but the answers from this
14
+ test model struck me as humorous for their strings of subway transfers
15
+ or rambling answers about apartments.
16
+
17
+ Try prompting with ```question?``` plus two spaces, or ```question? - more info``` plus two spaces
18
+
19
+ ## Blog
20
+
21
+ https://mapmeld.medium.com/gpt-nyc-part-1-9cb698b2e3d
22
+
23
+ ## Notebooks
24
+
25
+ ### Data processing / new tokens
26
+
27
+ https://colab.research.google.com/drive/13BOw0uekoAYB4jjQtaXTn6J_VHatiRLu
28
+
29
+ ### Fine-tuning GPT2 (small)
30
+
31
+ https://colab.research.google.com/drive/1FnXcAh4H-k8dAzixkV5ieygV96ePh3lR
32
+
33
+ ### Predictive text and probabilities
34
+
35
+ Scroll to the end of
36
+
37
+ https://colab.research.google.com/drive/1FnXcAh4H-k8dAzixkV5ieygV96ePh3lR
38
+
39
+ to see how to install git-lfs and trick ecco into loading this model.
added_tokens.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"DeKalb": 50734, "Chinatown": 50549, "jerky": 50957, "tinder": 50375, "City Line": 50495, "burrito": 50380, "Clason Point": 50458, "Arepa": 50870, "BK": 50362, "macrons": 50857, "Apartment": 50298, "Tudor City": 50567, "Smorrebrod": 50893, "Greenwood Heights": 50506, "Richmond Valley": 50700, "fajita": 50981, "Morris Park": 50472, "Koreatown": 50360, "Gerritsen Beach": 50528, "andouillette": 50943, "Whitlock": 50723, "Fondant": 50916, "buttered": 50960, "Alphabet City": 50555, "Vinegar Hill": 50492, "Raclette": 50839, "hydrant": 50379, "Hamilton Heights": 50577, "junkies": 50430, "NYE": 50355, "Uniqlo": 50356, "Shakshuka": 50845, "sardines": 50932, "falafel": 50877, "skunk": 50386, "Roosevelt Island": 50574, "Halsey": 50739, "Ankimo": 50971, "Port Richmond": 50664, "Meiers Corners": 50661, "Rikers Island": 50589, "St. George": 50667, "Rendang": 50992, "yummy": 50996, "Crispy": 50907, "Randall Manor": 50665, "Oakland Gardens": 50625, "lamington": 50885, "sketchy": 50332, "googling": 50405, "Rugby": 50539, "jamun": 50926, "Spritz": 50831, "West Village": 50554, "Erasmus": 50536, "Cambria Heights": 50635, "Tudor Village": 50619, "co-op": 50302, "assam": 50998, "Soundview": 50714, "metrocard": 50286, "Westchester Square": 50467, "Utrecht": 50757, "Claremont": 50437, "Far Rockaway": 50648, "New Hyde Park": 50640, "Intervale": 50711, "asparagus": 50811, "apfelstrudel": 50952, "spritz": 50832, "Todt Hill": 50689, "dumpling": 50349, "upstate": 50281, "Bleecker": 50769, "Botanic": 50738, "okonomiyaki": 50818, "Norwood": 50718, "goong": 50802, "Souvlaki": 50905, "syr": 50904, "Harding Park": 50459, "Mariners Harbor": 50660, "Jamaica Estates": 50612, "bodega": 50267, "Seamless": 50353, "Williamsbridge": 50482, "Castleton Corners": 50652, "Meadowmere": 50639, "Travis": 50690, "Battery Park City": 50547, "mta": 50271, "Ocean Breeze": 50686, "Woodrow": 50703, "Whitehall": 50773, "mayonnaise": 50912, "Marine Park": 50544, "octopus": 50822, "Siclen": 50765, "fondant": 50917, 
"souvlaki": 50906, "Bed-Stuy": 50422, "Starrett City": 50500, "Tompkinsville": 50671, "Baisley Park": 50626, "frequented": 50311, "ankimo": 50972, "overpriced": 50306, "no-fee": 50336, "Poutine": 50841, "Canarsie": 50541, "Carroll Gardens": 50501, "Piri-piri": 50991, "Parkchester": 50719, "Prospect Heights": 50510, "pimms": 50867, "Hollis Hills": 50623, "banh mi": 50823, "Lower East Side": 50552, "Dyckman": 50771, "Asparagus": 50812, "Annadale": 50691, "pintxos": 50798, "Peking": 50868, "Flatlands": 50542, "Barcade": 50335, "Knickerbocker": 50747, "Basque": 50920, "New Springville": 50684, "asian": 50407, "Emerson Hill": 50679, "frites": 50883, "Marzipan": 50967, "Middle Village": 50597, "ceviche": 50888, "Sutton Place": 50566, "Rector": 50772, "Taco": 50843, "Buttered": 50959, "custard": 50806, "Visiting": 50364, "Pudim": 50944, "Goi": 50973, "broadway": 50327, "Greenpoint": 50273, "Cheers": 50417, "Fajita": 50980, "Kosciuszko": 50748, "Farragut": 50537, "New Dorp": 50683, "Midtown": 50265, "Pelham Bay": 50464, "Highland Park": 50498, "iberico": 50865, "University Heights": 50444, "tart": 50808, "walkup": 50342, "Beverley": 50728, "Mayura": 50897, "gulab": 50925, "Halal": 50313, "masala": 50826, "Thrift": 50358, "bagels": 50278, "Sutphin": 50794, "Bellerose": 50633, "Baychester": 50476, "Concourse Village": 50438, "Manhattanville": 50578, "Dosa": 50827, "Tribeca": 50338, "Flatiron": 50339, "midwest": 50425, "Goong": 50801, "ambiance": 50368, "Cobble Hill": 50502, "Eltingville": 50694, "Bensonhurst": 50518, "Delancey": 50770, "Wagyu": 50899, "Sheepshead": 50762, "Seaside": 50650, "Nassau": 50426, "Nevins": 50756, "Jackson Heights": 50592, "Bath Beach": 50517, "fiancée": 50376, "skewer": 50946, "Montrose": 50752, "nyc": 50257, "Bronxdale": 50469, "pigeon": 50951, "Williamsburg": 50259, "College Point": 50603, "Polo Grounds": 50581, "TWC": 50334, "marmite": 50964, "Burnside": 50707, "Longwood": 50436, "Silver Beach": 50465, "arepa": 50871, "Anybody": 50370, 
"smazeny": 50902, "Fulton Mall": 50491, "grilled": 50820, "South Jamaica": 50631, "Currywurst": 50847, "Lasagna": 50891, "Newkirk": 50758, "Kew Gardens": 50614, "Westchester": 50284, "Olinville": 50480, "Chauncey": 50729, "peking": 50869, "Elmhurst": 50784, "Cobble": 50301, "Locust Point": 50463, "looped": 50412, "Manhattan Valley": 50571, "Paella": 50813, "Highbridge": 50441, "Park Slope": 50504, "Wingate": 50513, "Stuyvesant Town": 50565, "Coney": 50270, "Hudson Yards": 50559, "Asking": 50300, "tbh": 50319, "Neponsit": 50649, "Spuyten Duyvil": 50453, "pho": 50861, "Astoria Heights": 50585, "Grymes Hill": 50658, "Pennylvania": 50760, "parmesan": 50855, "neapolitan": 51001, "Lincoln Square": 50570, "T-Mobile": 50351, "Mount Hope": 50443, "beetroot": 50873, "parm": 50854, "Borough Park": 50521, "pastel": 50989, "durian": 50954, "Middletown": 50713, "Staying": 50428, "Pintxos": 50797, "Yorkville": 50576, "googled": 50391, "Wondering": 50398, "South Beach": 50688, "Throop": 50746, "West Farms": 50448, "brooklyn": 50279, "Knafeh": 50858, "scallop": 50923, "realtor": 50395, "Old Town": 50687, "Hoboken": 50277, "Crotona Park": 50439, "mayo": 50913, "escalator": 50294, "moules": 50881, "Bedford-Stuyvesant": 50316, "go-to": 50331, "New Lots": 50499, "Pastel": 50988, "Brighton Beach": 50525, "Marble Hill": 50452, "Howard Beach": 50617, "roast": 50879, "gentrified": 50295, "Aperol": 50829, "Conduit": 50781, "Stillwell": 50731, "Egbertville": 50678, "agnello": 50937, "Little Italy": 50551, "Remsen Village": 50538, "Cypress Hills": 50496, "Cortlandt": 50722, "gf": 50366, "Hwy": 50745, "Cypress": 50708, "Arrochar": 50674, "Cloisters": 50320, "dryer": 50387, "Sheepshead Bay": 50534, "Elderts": 50778, "full-time": 50409, "patata": 50938, "cuon": 50975, "confrontational": 50396, "Glendale": 50595, "wkend": 50402, "Equinox": 50303, "Grilled": 50819, "peeves": 50373, "Pastrami": 50815, "Financial District": 50548, "Okonomiyaki": 50817, "Flatbush": 50527, "york": 50347, "Ocean 
Parkway": 50524, "Two Bridges": 50557, "Windsor Terrace": 50508, "Queens Village": 50641, "Whitestone": 50607, "Brownsville": 50291, "tripping": 50372, "Steinway": 50793, "oyster": 50930, "taping": 50429, "shaksuka": 50846, "Bao": 50834, "Edenwald": 50477, "Hoyt": 50741, "Bayside": 50621, "Melrose": 50432, "Lenox Hill": 50573, "dollhouse": 50403, "Prince's Bay": 50699, "Suggestions": 50369, "low-income": 50308, "Rockaway": 50791, "Belle Harbor": 50645, "Wakefield": 50481, "guarantor": 50330, "LaGuardia": 50371, "Stinky": 50965, "Grasmere": 50681, "Winthrop": 50767, "Hummus": 50969, "Kilo": 50400, "Greenridge": 50696, "ohmi-gyu": 50977, "Lowery": 50777, "Bakery": 50314, "alturas": 50927, "Kingsbridge Heights": 50451, "Realtor": 50394, "Van Cortlandt": 50454, "Fairway": 50317, "gentrification": 50344, "Flushing": 50737, "Ozone": 50788, "pudim": 50945, "Reservoir": 50427, "Concord": 50654, "CUNY": 50361, "Fort Greene": 50489, "City Island": 50461, "Pleasant Plains": 50698, "IMO": 50274, "Pho": 50860, "Parmesan": 50853, "mandazi": 50890, "Kingsbridge": 50712, "Googling": 50406, "Dongan Hills": 50677, "Citibike": 50324, "Jamon": 50862, "jamon": 50863, "shoebox": 50385, "Morris Heights": 50442, "Luger": 50359, "quinces": 50933, "Cheapest": 50418, "Dyre": 50709, "Tart": 50807, "Wyckoff": 50755, "artichoke": 50910, "Croissant": 50982, "Smazeny": 50901, "Bergen": 50727, "Lefferts": 50789, "Rose Hill": 50562, "Utica": 50733, "Homecrest": 50529, "FYI": 50323, "itinerary": 50365, "Sq": 50329, "wagyu": 50900, "East Tremont": 50447, "Artichoke": 50909, "Zerega": 50725, "Sutter": 50763, "Sling": 50918, "Richmond Hill": 50616, "Sunnyside": 50299, "Lehman": 50705, "Springfield Gardens": 50632, "cliche": 50367, "Rooftop": 50322, "Mayonnaise": 50911, "Ohmi-gyu": 50976, "Wyck": 50783, "Peter Cooper Village": 50564, "Classon": 50730, "Astor": 50348, "Cereal": 50837, "Kew": 50785, "Dutch Kills": 50586, "Harlem": 50580, "Momofuku": 50350, "someplace": 50397, "Yung": 50799, "Livingston": 
50659, "Riverdale": 50357, "Randall's Island": 50582, "Anywhere": 50388, "Cannoli": 50803, "Dumbo": 50352, "muamba": 50994, "Kwa": 50956, "Washington Heights": 50584, "Forest Hills": 50600, "Eastchester": 50478, "Co-op City": 50462, "Livonia": 50749, "Manhattan Beach": 50532, "relocating": 50392, "Midwood": 50523, "doorman": 50321, "Rawson": 50776, "Moulded": 50809, "mater": 50928, "Bayswater": 50644, "Parkside": 50759, "Port Morris": 50434, "dosa": 50828, "Nereid": 50717, "Hunters Point": 50591, "Murray Hill": 50563, "palamos": 50935, "Clifton": 50653, "Chilli": 50835, "Corona": 50593, "lechón": 50979, "goulash": 50924, "Douglaston": 50622, "zenzero": 50939, "Liberty Park": 50598, "Lindenwood": 50618, "Fort Wadsworth": 50656, "Fort Hamilton": 50516, "Neapolitan": 51000, "Fieldston": 50450, "asap": 50414, "Boerum Hill": 50485, "Pimms": 50866, "Rutland": 50764, "Graniteville": 50657, "Linden Hill": 50604, "Woodhaven": 50795, "lasagna": 50892, "sauna": 50416, "picky": 50390, "Arlington": 50651, "Euclid": 50736, "Waterside Plaza": 50569, "Hewes": 50740, "marzipan": 50968, "New Brighton": 50662, "Rego": 50421, "Jerky": 50958, "platter": 50411, "NYSC": 50401, "Malba": 50605, "[PAD]": 51003, "Junius": 50744, "Tartare": 50874, "laksa": 50999, "Concourse": 50704, "speakeasy": 50304, "craigslist": 50283, "Racetrack": 50780, "Cortland": 50774, "MetroTech": 50743, "Pelham Parkway": 50474, "Pelham Gardens": 50473, "Bathgate": 50445, "stinky": 50966, "Morrisania": 50440, "SBS": 50354, "Red Hook": 50505, "Allerton": 50468, "Rented": 50393, "Upper East Side": 50575, "Holliswood": 50611, "Willets": 50787, "manhattan": 50264, "Pavlova": 50914, "Rochdale Village": 50629, "pierogi": 50850, "Pk": 50768, "Sunset Park": 50507, "Iberico": 50864, "chilli": 50836, "Fordham": 50710, "nypd": 50272, "Arden Heights": 50692, "Plum Beach": 50533, "Queensboro": 50790, "cronut": 50984, "crispy": 50908, "Willoughby": 50754, "Bergen Beach": 50540, "riddare": 50949, "Falafel": 50876, "Rib": 50886, 
"Frites": 50882, "crust": 50947, "Andouillette": 50942, "Moules": 50880, "Macrons": 50856, "Silver Lake": 50669, "Kensington": 50522, "preface": 50408, "Port Ivory": 50663, "Marcy": 50751, "croissant": 50983, "Arverne": 50643, "cuz": 50431, "Greenwich Village": 50550, "Mill Island": 50546, "Recommendations": 50374, "Broker": 50378, "Lamington": 50884, "tagine": 50931, "Lorimer": 50750, "Great Kills": 50695, "mayura": 50898, "Cronut": 50985, "hustle": 50310, "Ditmars": 50782, "Saratoga": 50761, "Bulls Head": 50676, "Bushwick": 50261, "moulded": 50810, "cereal": 50838, "tartare": 50875, "Woodlawn": 50724, "East Village": 50556, "Landlord": 50363, "sublet": 50269, "Queensboro Hill": 50606, "Crown Heights": 50509, "Bloomfield": 50675, "knafeh": 50859, "Gowanus": 50503, "Buhre": 50706, "meetup": 50280, "Unionport": 50460, "Elm Park": 50655, "stroller": 50318, "Ridgewood": 50599, "Lechon": 50953, "cannoli": 50804, "Marmite": 50963, "pavlova": 50915, "Ravenswood": 50588, "Floral Park": 50636, "Laurelton": 50638, "Rossville": 50701, "warzone": 50315, "kwa": 50955, "Indian Village": 50470, "decorate": 50399, "shish": 50987, "Fresh Meadows": 50610, "Brookville": 50634, "Morningside": 50337, "fai": 50950, "Astoria": 50258, "Meetup": 50341, "Jumbles": 50919, "movers": 50325, "Utopia": 50613, "ramen": 50305, "Penang": 50997, "Bowery": 50297, "paella": 50814, "LGA": 50290, "Long Island City": 50587, "Moved": 50413, "Williamsburg Houses": 50484, "Blissville": 50590, "Dijon": 50410, "Auburndale": 50620, "Van Nest": 50475, "Masala": 50825, "Aqueduct": 50779, "Bruckner": 50456, "Ocean Hill": 50493, "nightlife": 50275, "Brooklyn Heights": 50486, "Myrtle": 50753, "Maspeth": 50596, "Schermerhorn": 50742, "Clinton Hill": 50488, "Wards Island": 50583, "Sea Gate": 50526, "thrift": 50389, "poutine": 50842, "Dyker Heights": 50515, "Roast": 50878, "gimmicky": 50377, "St. 
Albans": 50630, "Breezy Point": 50646, "Prospect Lefferts Gardens": 50512, "Upper West Side": 50572, "bumble": 50423, "Seafood": 50961, "blizzard": 50419, "Rosebank": 50666, "Edgemere": 50647, "Shish": 50986, "Ditmas": 50735, "deli": 50285, "Briarwood": 50609, "Georgetown": 50543, "Bay Terrace": 50601, "Mandazi": 50889, "basque": 50921, "doable": 50287, "seafood": 50962, "Kings Highway": 50531, "Laconia": 50471, "Caviar": 50851, "Beetroot": 50872, "Gramercy Park": 50560, "gamba": 50934, "Ditmas Village": 50535, "fattiga": 50948, "Woodlawn Heights": 50483, "Duane": 50340, "midtown": 50260, "Stuyvesant Heights": 50494, "Fulton Ferry": 50490, "Queensbridge": 50775, "Seneca": 50792, "exterminator": 50382, "ConvergecultFiDi": 50293, "Bay Ridge": 50514, "Burrito": 50381, "Willets Point": 50608, "meetups": 50328, "smorrebrod": 50894, "Castle Hill": 50457, "taco": 50844, "aperol": 50830, "nata": 50990, "Jamaica": 50627, "Custard": 50805, "Pelham": 50720, "caviar": 50852, "Beechhurst": 50602, "Morningside Heights": 50579, "Stapleton": 50670, "Syr": 50903, "Woodside": 50796, "Glen Oaks": 50637, "yung": 50800, "bao": 50833, "Pierogi": 50849, "Little Neck": 50624, "Fresh Pond": 50594, "SoHo": 50309, "Weeksville": 50511, "Westerleigh": 50673, "UWS": 50262, "halal": 50312, "Shore Acres": 50668, "kebab": 50896, "Mosholu": 50715, "airbnb": 50346, "Grant City": 50680, "bulgogi": 50929, "Stuy": 50307, "uptown": 50266, "bellini": 50936, "Mill Basin": 50545, "Mott": 50288, "West Brighton": 50672, "Fish Bay": 50479, "downvote": 50282, "Lechón": 50978, "goi": 50974, "Ozone Park": 50615, "East New York": 50497, "raclette": 50840, "Banh mi": 50824, "Throgs Neck": 50466, "Bedford Park": 50449, "Ceviche": 50887, "Massaman": 51002, "renter": 50345, "Tottenville": 50702, "EWR": 50384, "pastrami": 50816, "Turnpike": 50786, "Hunts Point": 50435, "Pkwy": 50716, "Mapleton": 50520, "bagel": 50289, "touristy": 50292, "Aquarium": 50766, "Cortelyou": 50732, "DUMBO": 50326, "Rosedale": 50642, "Bronx 
River": 50455, "Roommate": 50415, "currywurst": 50848, "Inwood": 50343, "Nostrand": 50726, "LIRR": 50268, "yum": 50995, "Charleston": 50693, "Oakwood": 50685, "Hollis": 50628, "rendang": 50993, "Navy Yard": 50487, "Hell's Kitchen": 50558, "yelp": 50263, "Highline": 50333, "AirBnB": 50296, "Kebab": 50895, "Scallop": 50922, "Midland Beach": 50682, "sweeper": 50404, "Tremont": 50721, "Kips Bay": 50561, "tortellini": 50940, "Turtle Bay": 50568, "Gravesend": 50519, "drizzling": 50383, "Octopus": 50821, "surcharge": 50424, "NoHo": 50553, "newyork": 50276, "Huguenot": 50697, "Belmont": 50446, "hummus": 50970, "Kings Bay": 50530, "Mott Haven": 50433, "brodo": 50941, "Brokers": 50420}
config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/content/gpt-nyc",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "gradient_checkpointing": false,
12
+ "initializer_range": 0.02,
13
+ "layer_norm_epsilon": 1e-05,
14
+ "model_name": "gpt2",
15
+ "model_type": "gpt2",
16
+ "n_ctx": 1024,
17
+ "n_embd": 768,
18
+ "n_head": 12,
19
+ "n_inner": null,
20
+ "n_layer": 12,
21
+ "n_positions": 1024,
22
+ "resid_pdrop": 0.1,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "task_specific_params": {
30
+ "text-generation": {
31
+ "do_sample": true,
32
+ "max_length": 50
33
+ }
34
+ },
35
+ "transformers_version": "4.6.0",
36
+ "use_cache": true,
37
+ "vocab_size": 51004
38
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6bc8d1bb0e8b93d83756b1640cfc156243e7b56780951c593651ee570dffc202
3
+ size 512703099
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>", "pad_token": "[PAD]"}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "<|endoftext|>", "bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "add_prefix_space": false, "model_max_length": 1024, "special_tokens_map_file": null, "name_or_path": "gpt2"}
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6f1ffd42449fc959e8deab18ec19dd0e2fc08ee5d28882680825b7e1a6fba8e
3
+ size 2415
vocab.json ADDED
The diff for this file is too large to render. See raw diff