{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "df = pd.read_csv('/Users/ludovicaschaerf/Desktop/Data/omniart_v3_datadump.csv')\n", "df.shape, df.columns" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df['collection_origins'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df['general_type'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for val, c in zip(df['artwork_type'].value_counts(), df['artwork_type'].value_counts().index):\n", " print(val,c)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[df['collection_origins'] == 'The British Library [Flickr]']['artwork_type'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[df['collection_origins'] == 'The British Library [Flickr]']['general_type'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[df['collection_origins'] == 'Brill Iconclass Arkyves']['artwork_type'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[df['collection_origins'] == 'The Met 17']['artwork_type'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[df['collection_origins'] == 'DeviantArt']['artwork_type'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[df['collection_origins'] == 'WikiArts 17']['artwork_type'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[df['collection_origins'] == 'MOMA - New York']['artwork_type'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_filtered = df[~df['artwork_type'].fillna('').str.contains('book')]\n", "df_filtered.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_filtered = df_filtered[~df_filtered['artwork_type'].fillna('').str.contains('illustr')]\n", "df_filtered.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_filtered = df_filtered[~df_filtered['artwork_type'].fillna('').str.contains('unknown')]\n", "df_filtered.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "to_remove = df_filtered[df_filtered['collection_origins'] == 'Brill Iconclass Arkyves'][~(df_filtered['artwork_type'] == 'image')].index\n", "df_filtered = df_filtered.drop(to_remove, axis=0)\n", "df_filtered.shape, df_filtered[df_filtered['collection_origins'] == 'Brill Iconclass Arkyves']['artwork_type'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_filtered = df_filtered[~(df_filtered['collection_origins'] == 'The British Library [Flickr]')]\n", "df_filtered.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_filtered = df_filtered.reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_filtered = 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_filtered = df_filtered[~(df_filtered['collection_origins'] == 'The Met 17')]\n", "df_filtered.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# start again from the full dump, this time keeping only textiles\n", "df_filtered = df[df['artwork_type'] == 'textiles'].reset_index()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_filtered.sample(5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from PIL import Image\n", "import requests\n", "from io import BytesIO\n", "\n", "\n", "print(df_filtered['image_url'][0])\n", "# Download the image\n", "response = requests.get(df_filtered['image_url'][0])\n", "image_data = response.content\n", "\n", "# Open and display the image\n", "image = Image.open(BytesIO(image_data))\n", "image.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "import random" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Randomly select 400 images for a 20x20 grid\n", "random_images = random.sample(list(df_filtered['image_url']), 20*20)\n", "\n", "# Set up the grid layout\n", "grid_size = (20, 20)\n", "fig, axs = plt.subplots(*grid_size, figsize=(100, 100))\n", "\n", "# Display the images in the grid\n", "for i, image_link in enumerate(random_images):\n", "    response = requests.get(image_link)\n", "    image_data = response.content\n", "    try:\n", "        image = Image.open(BytesIO(image_data))\n", "    except (OSError, IOError):\n", "        print(f\"Skipped image {i + 1} due to an error\")\n", "        continue\n", "\n", "    # Compute the grid coordinates\n", "    x = i % grid_size[1]\n", "    y = i // grid_size[1]\n", "\n", "    # Display the image in the corresponding grid cell\n", "    axs[y, x].imshow(image)\n", "    axs[y, x].axis('off')\n", "\n", "# Adjust spacing and display the grid of images\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_filtered['collection_origins'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_filtered = df_filtered[df_filtered['collection_origins'] == 'The Met 17'].reset_index()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_filtered['omni_id']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_filtered.columns" ] },
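{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Quick sanity check (sketch): how complete are the fields used below to link the\n", "# Met subset to MetObjects.csv and to fetch images?\n", "df_filtered[['original_id_in_collection', 'image_url']].isna().sum()" ] },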
"code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_filtered.loc[0, ['original_id_in_collection', 'artwork_name', 'artist_full_name']]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "met_data = pd.read_csv('/Users/ludovicaschaerf/Desktop/Data/MetObjects.csv')\n", "met_data.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_filtered.to_csv('/Users/ludovicaschaerf/Desktop/Data/omniart_v3_textiles.csv', index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_filtered.to_csv('/Users/ludovicaschaerf/Desktop/Data/omniart_v3_filtered.csv', index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import requests\n", "req = requests.get(\"https://api.vam.ac.uk/v2/objects/search?images_exist=true&id_category=THES48885&kw_object_type=Textile%20design&kw_object_type=Furnishing%20fabric&page_size=100\")\n", "object_data = req.json()\n", "print(object_data)\n", "object_info = object_data[\"info\"]\n", "object_records = object_data[\"records\"]\n", "record_count = object_info[\"record_count\"]\n", "print(f\"There are {record_count} objects that have these aspects in the record\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import urllib.request\n", "\n", "def download_image(url, file_path, file_name):\n", " full_path = file_path + file_name + '.jpg'\n", " urllib.request.urlretrieve(url, full_path)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from tqdm import tqdm\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(f\"https://api.vam.ac.uk/v2/objects/search?images_exist=true&id_category=THES48885&kw_object_type=Textile%20design&response_format=csv&page={0}&kw_object_type=Furnishing%20fabric&page_size=100\")\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "f\"https://framemark.vam.ac.uk/collections/{df.loc[0, '_primaryImageId']}/full/735,/0/default.jpg\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "out_path = '/Users/ludovicaschaerf/Desktop/Data/VA_textiles/'\n", "\n", "for i in tqdm(range(100)):\n", " df = pd.read_csv(f\"https://api.vam.ac.uk/v2/objects/search?images_exist=true&id_category=THES48885&kw_object_type=Textile%20design&response_format=csv&kw_object_type=Furnishing%20fabric&page_size=100&page={i}\")\n", " \n", " df.to_csv(out_path + f'info_{i}.csv', index = False)\n", " \n", " for j in range(100):\n", " im_url = f\"https://framemark.vam.ac.uk/collections/{df.loc[j, '_primaryImageId']}/full/735,/0/default.jpg\"\n", " name = df.loc[j, 'systemNumber']\n", " \n", " download_image(im_url, out_path, name)\n", " \n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for i in range(100):\n", " df = pd.read_csv(out_path + f'info_{i}.csv')\n", " \n", " if i == 0:\n", " total_df = df\n", " print(total_df.head())\n", " else:\n", " total_df = pd.concat([total_df, df], axis=0)\n", " \n", "print(total_df.shape)\n", "total_df.to_csv(out_path + f'complete_info.csv', index = False)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "image = 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "image = object_records[4]['_images']['_iiif_image_base_url'] + 'full/735,/0/default.jpg'\n", "image" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# download the first page of results already held in object_records\n", "for rec in tqdm(object_records):\n", "    im_url = rec['_images']['_iiif_image_base_url'] + 'full/735,/0/default.jpg'\n", "    name = rec['systemNumber']\n", "\n", "    download_image(im_url, out_path, name)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import undetected_chromedriver as uc" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def chromeUndetectableDriver():\n", "    driver = uc.Chrome()\n", "    return driver\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# trailing slash matters: download_image concatenates file_path + file_name\n", "out_path = '/Users/ludovicaschaerf/Desktop/Data/GWU_textiles/'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import re\n", "import time\n", "\n", "from bs4 import BeautifulSoup as bs\n", "\n", "def scrape_gwm():\n", "    driver = chromeUndetectableDriver()\n", "    driver.get('https://collections-gwu.zetcom.net/en/collection/?f=withImages&f=publicDomain&ff=%7B%22classification_en_s%22%3A%5B%22Textile%22%5D%7D&om=5')\n", "\n", "    d = {'Filename': [], 'Title': [], 'Geography': [], 'Culture': [], 'Materials': [],\n", "         'Collection': [], 'Accession Number': [], 'Credit Line': [], 'Date': [],\n", "         'Copyright': [], 'Object Type': [], 'Dimensions': [], 'Structure': [], 'Used in': []}\n", "\n", "    c = 1\n", "    # scroll down until the full page is rendered\n", "    soup = None\n", "    time.sleep(10)\n", "    soup = bs(driver.page_source, 'html.parser')\n", "    print(soup)\n", "    # while True and c < 500:  # c < 500 to prevent an infinite loop\n", "    #     c += 1\n", "    #     time.sleep(3)\n", "    #     driver.execute_script(\"window.scrollTo(0,document.body.scrollHeight)\")\n", "    #     ele = driver.find_element(\"xpath\", '//*[@class=\"SearchFormResults-buttonViewMore\"]')\n", "    #     ele.click()\n", "    #     time.sleep(3)\n", "    #     s = bs(driver.page_source, 'html.parser')\n", "    #     if s != soup:\n", "    #         soup = s\n", "    #         continue\n", "    #     else:\n", "    #         print('reached end of page')\n", "    #         break\n", "\n", "    for a in tqdm(\n", "            soup.findAll('div', id=re.compile('search-result-item-object-*'))):  # use soup.findAll('div', 'item') for the old version of the website\n", "        try:\n", "            link = 'https://collections-gwu.zetcom.net/' + a.find('a')['href']  # a.find('a')['href'] for the old version of the website\n", "        except Exception as e:\n", "            print('not working', e, a)\n", "            continue\n", "\n", "        # derive a filesystem-safe file name from the object link\n", "        name = a.find('a')['href'].strip('/').replace('/', '_')\n", "        driver.get(link)\n", "        time.sleep(2)\n", "\n", "        soup_ = bs(driver.page_source, 'html.parser')\n", "        i = soup_.find('img', {'class': 'Carousel-itemImage'})\n", "        print(i)\n", "\n", "        if i is not None:\n", "            i = 'https://collections-gwu.zetcom.net/' + i['src']\n", "            print(i, name)\n", "            download_image(i, out_path, name)\n", "            d['Filename'].append(name)\n", "            d['Title'].append(soup_.find('h1', {'class': 'text-primary'}).text)\n", "            for item in soup_.findAll('div', id='CollectionDetails-Item'):\n", "                label = item.find('div', {'class': 'CollectionDetails-Item-Lable'}).text.strip()\n", "                if label in d:\n", "                    d[label].append(item.find('div', {'class': 'CollectionDetails-Item-Body'}).text.strip())\n", "        else:\n", "            print('Link was none,', soup_)\n", "\n", "    print(d)\n", "\n", "    # pd.DataFrame(d).to_csv(output_csv, index=False)\n", "    # return pd.DataFrame(d)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ 
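"# Note: scrape_gwm() currently only prints what it scrapes; the CSV export and the\n", "# return statement at the end of the function are still commented out.\n",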
"scrape_gwm()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! python /Users/ludovicaschaerf/Desktop/Repos/ailia-models/background_removal/indexnet/indexnet.py --input /Users/ludovicaschaerf/Desktop/Data/VA_textiles/O25180.jpg --savepath /Users/ludovicaschaerf/Desktop/Data/NO_BG/no_bg_O25180.jpg -a u2net" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "art-reco_x86", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.16" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }