"""Build a dataset of Met Museum objects, enriched with page descriptions.

The script downloads object records from the Met Museum collection API for a
fixed list of object ids, keeps a subset of the fields, scrapes a textual
description from each object's web page, and writes the result as CSV and
JSON.
"""

import json
import logging
from typing import Iterable, List, Optional

import cv2
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

# Seconds to wait for any single HTTP request. Without a timeout, a stalled
# connection would block the whole run indefinitely.
REQUEST_TIMEOUT = 30

# File the raw API records are written to and later read back from. The
# downloader and the readers share this constant so they always agree.
COLLECTION_JSON_PATH = 'met_museum_collection.json'

# Fields of an API record that are kept in the final dataset.
KEYS_TO_KEEP = [
    'objectID', 'primaryImage', 'department', 'objectName', 'title',
    'culture', 'period', 'artistDisplayName', 'objectDate', 'medium',
    'dimensions', 'city', 'state', 'country', 'region', 'classification',
    'repository', 'objectURL',
]

# Filtered ids based on Indian data from the overall Met Museum dataset.
INDIAN_OBJECT_IDS = [
    1986, 7608, 9748, 38638, 86116, 86136, 86167, 86169, 86171, 86173,
    86176, 86177, 86178, 86179, 86186, 86187, 86189, 86195, 86216, 86223,
    126982, 127021, 127511, 308007, 443097, 443099, 444411, 444412, 444539, 444541,
    444546, 444552, 444557, 444575, 444577, 444582, 444588, 444592, 444609, 444618,
    444623, 444628, 444634, 444648, 444650, 444652, 444657, 444658, 444661, 444662,
    444667, 444668, 444670, 444671, 444672, 444674, 444675, 444676, 444678, 444679,
    444681, 444684, 444689, 444692, 444694, 444695, 444720, 444814, 444816, 444832,
    444836, 444838, 444854, 444856, 444857, 444859, 444860, 444861, 444863, 444864,
    444865, 444866, 444867, 444868, 444963, 444964, 444984, 444987, 444988, 444989,
    445003, 445004, 445005, 445227, 445231, 445232, 445233, 445239, 445245, 445246,
    445261, 445280, 445362, 445645, 445646, 445647, 445648, 445689, 445690, 445691,
    445693, 445708, 445869, 445870, 445871, 445872, 445873, 445874, 445875, 445878,
    445879, 445880, 445881, 445882, 445889, 445890, 445894, 445957, 445958, 445961,
    445963, 445975, 445976, 445997, 445998, 445999, 446000, 446001, 446002, 446003,
    446004, 446005, 446007, 446157, 446158, 446163, 446164, 446189, 446266, 446276,
    446278, 446280, 446290, 446293, 446299, 446549, 446556, 446558, 446560, 446561,
    446562, 446563, 446564, 446566, 446567, 446571, 446572, 446573, 446574, 446575,
    446576, 446577, 446578, 446579, 446580, 446581, 446582, 446583, 446584, 446585,
    446586, 446587, 446588, 446589, 446591, 446633, 446634, 446635, 446638, 446640,
    446643, 446644, 446646, 446661, 446662, 446663, 446704, 446705, 446706, 446707,
    446708, 446709, 446710, 446711, 446712, 446713, 446714, 446715, 446716, 446717,
    446718, 446719, 446720, 446721, 446722, 446723, 446724, 446725, 446726, 446727,
    446728, 446729, 446730, 446731, 446732, 446733, 446734, 446735, 446736, 446737,
    446738, 446739, 446740, 446741, 446742, 446743, 446744, 446745, 446746, 446747,
    446748, 446749, 446750, 446751, 446752, 446753, 446754, 446755, 446756, 446757,
    446759, 446760, 446761, 446762, 446763, 446764, 446765, 446766, 446767, 446768,
    446769, 446770, 446771, 446772, 446773, 446774, 446775, 446776, 446777, 446778,
    446779, 446780, 446781, 446782, 446783, 446784, 446785, 446786, 446787, 446788,
    446789, 446790, 446791, 446792, 446793, 446794, 446795, 446796, 446797, 446798,
    446799, 446800, 446801, 446802, 446804, 446807, 446808, 446810, 446816, 446817,
    446855, 446857, 446859, 446872, 446882, 446893, 446967, 446987, 446988, 446991,
    446992, 446993, 447021, 447050, 447051, 447053, 447054, 447055, 447062, 447067,
    447082, 447083, 447084, 447086, 447087, 447088, 447089, 447090, 447091, 447092,
    447093, 447094, 447095, 447101, 447102, 447108, 447109, 447114, 447118, 447297,
    447298, 447299, 447300, 447301, 447302, 447303, 447304, 447305, 447306, 447307,
    447308, 447364, 447365, 447388, 447389, 447399, 447401, 447402, 447403, 447404,
    447405, 447406, 447407, 447408, 447409, 447410, 447411, 447412, 447413, 447414,
    447415, 447416, 447417, 447419, 447421, 447530, 447531, 447606, 447627, 447675,
    447703, 447743, 447744, 447752, 447755, 447757, 447758, 447760, 447773, 447774,
    447775, 447776, 447780, 447795, 447796, 447797, 447799, 447800, 447801, 447808,
    447816, 447817, 447821, 447822, 447848, 447849, 447867, 447868, 447918, 447919,
    447939, 447944, 447946, 447947, 447948, 447969, 448015, 448018, 448147, 448148,
    448149, 448150, 448151, 448152, 448153, 448154, 448155, 448156, 448157, 448158,
    448159, 448160, 448161, 448162, 448163, 448164, 448165, 448166, 448167, 448168,
    448178, 448180, 448188, 448189, 448190, 448191, 448206, 448212, 448213, 448214,
    448215, 448216, 448241, 448247, 448257, 448277, 448278, 448351, 448352, 448378,
    448379, 448380, 448450, 448460, 448464, 448465, 448466, 448467, 448468, 448469,
    448470, 448471, 448472, 448473, 448474, 448475, 448476, 448477, 448478, 448479,
    448480, 448481, 448484, 448485, 448486, 448487, 448488, 448489, 448491, 448494,
    448495, 448508, 448517, 448518, 448519, 448521, 448522, 448523, 448524, 448525,
    448526, 448527, 448528, 448529, 448530, 448531, 448532, 448533, 448534, 448535,
    448536, 448553, 448583, 448587, 448589, 448591, 448657, 448677, 448923, 448936,
    448937, 449094, 449105, 450374, 450395, 450465, 450469, 450471, 450496, 450508,
    450529, 450538, 450572, 450573, 450584, 450586, 450592, 450595, 450601, 450610,
    450613, 450614, 450618, 450622, 450625, 450631, 450639, 450642, 450665, 450674,
    450677, 450699, 450708, 450733, 450750, 450754, 450964, 450986, 451009, 451010,
    451018, 451021, 451118, 451186, 451195, 451201, 451257, 451258, 451259, 451260,
    451261, 451262, 451263, 451264, 451265, 451266, 451267, 451268, 451269, 451270,
    451271, 451272, 451273, 451274, 451275, 451276, 451277, 451278, 451279, 451280,
    451281, 451282, 451283, 451284, 451285, 451286, 451287, 451288, 451289, 451291,
    451292, 451293, 451294, 451295, 451296, 451297, 451313, 451314, 451315, 451316,
    451317, 451318, 451334, 451337, 451393, 451401, 451409, 451413, 451479, 451483,
    451485, 451611, 451683, 451961, 451982, 452104, 452192, 452197, 452205, 452214,
    452294, 452307, 452310, 452394, 452395, 452418, 452419, 452520, 452521, 452554,
    452680, 452681, 452734, 452745, 452746, 452749, 452776, 452777, 452780, 452805,
    452811, 452813, 452814, 452815, 452818, 452819, 452836, 452912, 452913, 452942,
    452943, 452944, 452963, 453053, 453054, 453058, 453059, 453060, 453061, 453062,
    453063, 453064, 453065, 453066, 453067, 453068, 453069, 453070, 453071, 453072,
    453073, 453074, 453075, 453076, 453077, 453078, 453079, 453080, 453081, 453082,
    453083, 453084, 453085, 453086, 453087, 453088, 453089, 453090, 453091, 453092,
    453093, 453094, 453095, 453096, 453097, 453098, 453099, 453100, 453101, 453102,
    453103, 453104, 453105, 453106, 453107, 453108, 453109, 453110, 453111, 453112,
    453113, 453114, 453115, 453116, 453117, 453118, 453119, 453120, 453121, 453122,
    453123, 453124, 453125, 453126, 453127, 453128, 453129, 453130, 453131, 453132,
    453133, 453134, 453135, 453136, 453137, 453138, 453139, 453140, 453141, 453142,
    453143, 453144, 453145, 453146, 453147, 453148, 453149, 453150, 453151, 453152,
    453153, 453154, 453159, 453160, 453161, 453163, 453169, 453180, 453183, 453184,
    453188, 453191, 453192, 453193, 453194, 453195, 453206, 453207, 453210, 453212,
    453213, 453214, 453215, 453216, 453217, 453218, 453219, 453220, 453221, 453222,
    453223, 453224, 453225, 453226, 453227, 453228, 453234, 453241, 453242, 453243,
    453253, 453258, 453259, 453260, 453261, 453263, 453265, 453266, 453267, 453268,
    453271, 453272, 453273, 453276, 453280, 453281, 453287, 453288, 453309, 453313,
    453328, 453329, 453330, 453331, 453332, 453334, 453335, 453337, 453338, 453339,
    453340, 453341, 453343, 453344, 453348, 453350, 453354, 453355, 453356, 453360,
    453366, 453367, 453384, 453386, 453387, 453397, 453412, 453415, 453417, 453427,
    453428, 453435, 453436, 453442, 453479, 453482, 453498, 453500, 453564, 453642,
    453673, 453975, 453983, 454003, 454011, 454013, 454027, 454038, 454049, 454054,
    454064, 454083, 454084, 454087, 454088, 454089, 454090, 454091, 454093, 454094,
    454095, 454096, 454097, 454098, 454099, 454100, 454113, 454114, 454115, 454139,
    454343, 454344, 454357, 454358, 454359, 454400, 454406, 454407, 454408, 454409,
    454415, 454429, 454445, 454515, 454530, 454568, 454601, 454619, 454620, 454623,
    454624, 454625, 454713, 454718, 454738, 454739, 454768, 455027, 455028, 455029,
    455030, 455031, 455032, 455033, 455034, 455035, 455036, 455048, 455060, 455064,
    455285, 455300, 455544, 455545, 455546, 455547, 455548, 456128, 456312, 456313,
    456314, 456342, 456875, 456921, 456922, 456923, 456926, 456927, 456928, 456949,
    456951, 456958, 456959, 456964, 456966, 456967, 456968, 456985, 456986, 457019,
    457020, 457054, 457058, 457711, 457728, 457730, 457743, 457755, 457758, 457771,
    457781, 457782, 457783, 457784, 457788, 457977, 458260, 458378, 458437, 458438,
    458439, 458440, 458444, 458445, 500709, 500711, 500712, 500713, 500714, 500715,
    500716, 500717, 500718, 500719, 500720, 500731, 500732, 500733, 500734, 500735,
    500736, 500738, 500739, 500740, 500741, 500742, 500743, 500744, 500745, 500748,
    500749, 500750, 500751, 500752, 500754, 500755, 500756, 500757, 500758, 500759,
    500760, 500761, 500763, 500764, 500767, 500769, 500771, 500772, 500773, 500774,
    500775, 500777, 500778, 500779, 500781, 500783, 500785, 500790, 500791, 500793,
    500794, 500800, 500801, 500802, 500806, 500809, 500835, 500838, 500844, 500869,
    500909, 500914, 500994, 501466, 501811, 501879, 501899, 501903, 501981, 502018,
    502072, 502111, 502149, 502150, 502236, 502310, 503431, 503453, 503554, 503635,
    503672, 503937, 504217, 505237, 505818, 506064, 506151, 632624, 646829, 659894,
    659901, 659903, 659904, 659905, 659906, 662008, 684250, 688168, 688233, 688240,
    688242, 698587, 698590, 698600, 698602, 698612, 698613, 698616, 698646, 698647,
    698648, 698649, 698651, 698652, 698655, 698657, 698660, 698661, 698662, 698663,
    698664, 698671, 698672, 698673, 707408, 717767, 734073, 752291, 761737, 772012,
    781487, 781855, 781858, 781892, 784283, 784629, 817378, 820019, 821905, 822698,
    823157, 823285, 827351, 835238, 838357, 843822, 843823, 853714, 872977,
]


def download_metmuseum_data(ids: Iterable[int]) -> None:
    """Download object records from the Met collection API and save them.

    Each id is requested from the public objects endpoint. Responses with a
    status other than 200 are skipped. The collected records are written as a
    JSON array to ``met_museum_collection.json`` in the working directory.

    Args:
        ids: Object ids to request.

    Raises:
        requests.RequestException: If a request fails at the network level
            or exceeds ``REQUEST_TIMEOUT``.
    """
    records = []
    for object_id in ids:
        response = requests.get(
            f'https://collectionapi.metmuseum.org/public/collection/v1/objects/{object_id}',
            timeout=REQUEST_TIMEOUT,
        )
        if response.status_code == 200:
            records.append(response.json())

    with open(COLLECTION_JSON_PATH, 'w') as json_file:
        json.dump(records, json_file)


def filter_and_save_data(keys_to_keep: List[str]) -> pd.DataFrame:
    """Keep selected columns of the downloaded collection and export them.

    Reads the file written by ``download_metmuseum_data``, selects the given
    columns, and writes the result to ``final_data_df.csv`` and
    ``final_data_json.json``.

    Args:
        keys_to_keep: Column names to retain.

    Returns:
        An independent DataFrame holding only the requested columns.

    Raises:
        KeyError: If a requested column is missing from the data.
    """
    df = pd.read_json(COLLECTION_JSON_PATH)
    # Copy so callers can add columns without pandas warning about writing
    # to a slice of the original frame.
    final_data_df = df[keys_to_keep].copy()
    final_data_df.to_csv('final_data_df.csv', index=False)
    final_data_df.to_json('final_data_json.json', orient='records', indent=4)
    return final_data_df


def download_and_convert_image(url: str) -> Optional[np.ndarray]:
    """Download an image and decode it into an OpenCV colour array.

    Args:
        url: Address of the image to fetch.

    Returns:
        The array produced by ``cv2.imdecode`` with ``cv2.IMREAD_COLOR``, or
        ``None`` when the response status is not 200 or the body is empty.

    Raises:
        requests.RequestException: If the request fails at the network level
            or exceeds ``REQUEST_TIMEOUT``.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    if response.status_code != 200 or not response.content:
        # An empty buffer cannot be decoded; treat it like a failed download.
        return None
    buffer = np.frombuffer(response.content, np.uint8)
    return cv2.imdecode(buffer, cv2.IMREAD_COLOR)


def extract_description(url: str) -> Optional[str]:
    """Scrape the introductory description from an object's web page.

    Looks for a ``div`` with class ``artwork__intro__desc`` and returns its
    stripped text.

    Args:
        url: Address of the object page.

    Returns:
        The description text, or ``None`` when the page cannot be fetched,
        the status is not 200, or the element is absent.
    """
    # Deliberately best-effort: this runs once per row, and one bad page must
    # not abort the whole dataset build. Failures are logged and yield None so
    # error text never ends up stored as a description.
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            return None
        soup = BeautifulSoup(response.content, 'html.parser')
        description_element = soup.find('div', class_='artwork__intro__desc')
    except Exception:
        logger.warning('Could not extract description from %s', url, exc_info=True)
        return None

    if description_element:
        return description_element.get_text(strip=True)
    return None


def create_master_data(df: pd.DataFrame) -> None:
    """Add a ``description`` column and export the frame as CSV and JSON.

    The description of each row is scraped from its ``objectURL``. The frame
    is modified in place and written to ``master_data.csv`` and
    ``master_data.json``.

    Args:
        df: Frame with an ``objectURL`` column.
    """
    df['description'] = df['objectURL'].apply(extract_description)
    df.to_csv('master_data.csv', index=False)
    df.to_json('master_data.json', orient='records', indent=4)


def main() -> None:
    """Download the selected objects and build the master dataset."""
    # Download the data for the selected ids and store the raw records.
    download_metmuseum_data(INDIAN_OBJECT_IDS)

    # Read the raw records back and keep only the fields of interest. The
    # copy lets create_master_data add a column without touching a slice.
    df = pd.read_json(COLLECTION_JSON_PATH)
    final_data_df = df[KEYS_TO_KEEP].copy()

    # Add the scraped description and save as CSV and JSON.
    create_master_data(final_data_df)


if __name__ == "__main__":
    main()