{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv('/Users/ludovicaschaerf/Desktop/Data/omniart_v3_datadump.csv')\n",
    "df.shape, df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['collection_origins'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['general_type'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for val, c in zip(df['artwork_type'].value_counts(), df['artwork_type'].value_counts().index):\n",
    "    print(val,c)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[df['collection_origins'] == 'The British Library [Flickr]']['artwork_type'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[df['collection_origins'] == 'The British Library [Flickr]']['general_type'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[df['collection_origins'] == 'Brill Iconclass Arkyves']['artwork_type'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[df['collection_origins'] == 'The Met 17']['artwork_type'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[df['collection_origins'] == 'DeviantArt']['artwork_type'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[df['collection_origins'] == 'WikiArts 17']['artwork_type'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[df['collection_origins'] == 'MOMA - New York']['artwork_type'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_filtered = df[~df['artwork_type'].fillna('').str.contains('book')]\n",
    "df_filtered.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_filtered = df_filtered[~df_filtered['artwork_type'].fillna('').str.contains('illustr')]\n",
    "df_filtered.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_filtered = df_filtered[~df_filtered['artwork_type'].fillna('').str.contains('unknown')]\n",
    "df_filtered.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "to_remove = df_filtered[df_filtered['collection_origins'] == 'Brill Iconclass Arkyves'][~(df_filtered['artwork_type'] == 'image')].index\n",
    "df_filtered = df_filtered.drop(to_remove, axis=0)\n",
    "df_filtered.shape, df_filtered[df_filtered['collection_origins'] == 'Brill Iconclass Arkyves']['artwork_type'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_filtered = df_filtered[~(df_filtered['collection_origins'] == 'The British Library [Flickr]')]\n",
    "df_filtered.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_filtered = df_filtered.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_filtered = df_filtered[~(df_filtered['collection_origins'] == 'The Met 17')]\n",
    "df_filtered.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_filtered = df[df['artwork_type'] == 'textiles'].reset_index()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_filtered.sample(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from PIL import Image\n",
    "import requests\n",
    "from io import BytesIO\n",
    "\n",
    "\n",
    "print(df_filtered['image_url'][0])\n",
    "# Download the image\n",
    "response = requests.get(df_filtered['image_url'][0])\n",
    "image_data = response.content\n",
    "\n",
    "# Open and display the image\n",
    "image = Image.open(BytesIO(image_data))\n",
    "image.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'random' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m/Users/ludovicaschaerf/Desktop/latent-space-theories/view_data.ipynb Cell 23\u001b[0m in \u001b[0;36m<cell line: 2>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      <a href='vscode-notebook-cell:/Users/ludovicaschaerf/Desktop/latent-space-theories/view_data.ipynb#X25sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m \u001b[39m# Randomly select 20 images\u001b[39;00m\n\u001b[0;32m----> <a href='vscode-notebook-cell:/Users/ludovicaschaerf/Desktop/latent-space-theories/view_data.ipynb#X25sZmlsZQ%3D%3D?line=1'>2</a>\u001b[0m random_images \u001b[39m=\u001b[39m random\u001b[39m.\u001b[39msample(\u001b[39mlist\u001b[39m(df_filtered[\u001b[39m'\u001b[39m\u001b[39mimage_url\u001b[39m\u001b[39m'\u001b[39m]), \u001b[39m20\u001b[39m\u001b[39m*\u001b[39m\u001b[39m20\u001b[39m)\n\u001b[1;32m      <a href='vscode-notebook-cell:/Users/ludovicaschaerf/Desktop/latent-space-theories/view_data.ipynb#X25sZmlsZQ%3D%3D?line=3'>4</a>\u001b[0m \u001b[39m# Set up the grid layout\u001b[39;00m\n\u001b[1;32m      <a href='vscode-notebook-cell:/Users/ludovicaschaerf/Desktop/latent-space-theories/view_data.ipynb#X25sZmlsZQ%3D%3D?line=4'>5</a>\u001b[0m grid_size \u001b[39m=\u001b[39m (\u001b[39m20\u001b[39m, \u001b[39m20\u001b[39m)\n",
      "\u001b[0;31mNameError\u001b[0m: name 'random' is not defined"
     ]
    }
   ],
   "source": [
    "# Randomly select 20 images\n",
    "random_images = random.sample(list(df_filtered['image_url']), 20*20)\n",
    "\n",
    "# Set up the grid layout\n",
    "grid_size = (20, 20)\n",
    "fig, axs = plt.subplots(*grid_size, figsize=(100, 100))\n",
    "\n",
    "# Display the images in the grid\n",
    "for i, image_link in enumerate(random_images):\n",
    "    response = requests.get(image_link)\n",
    "    image_data = response.content\n",
    "    try:\n",
    "        image = plt.imread(BytesIO(image_data), format='auto')\n",
    "    except (OSError, IOError,):\n",
    "        print(f\"Skipped image {i + 1} due to an error\")\n",
    "\n",
    "    # Compute the grid coordinates\n",
    "    x = i % grid_size[1]\n",
    "    y = i // grid_size[1]\n",
    "    \n",
    "    # Display the image in the corresponding grid cell\n",
    "    axs[y, x].imshow(image)\n",
    "    axs[y, x].axis('off')\n",
    "\n",
    "# Adjust spacing and display the grid of images\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_filtered['collection_origins'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_filtered = df_filtered[df_filtered['collection_origins']=='The Met 17'].reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_filtered['omni_id']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_filtered.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_filtered.loc[0, ['original_id_in_collection', 'artwork_name', 'artist_full_name']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "met_data = pd.read_csv('/Users/ludovicaschaerf/Desktop/Data/MetObjects.csv')\n",
    "met_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_filtered.to_csv('/Users/ludovicaschaerf/Desktop/Data/omniart_v3_textiles.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_filtered.to_csv('/Users/ludovicaschaerf/Desktop/Data/omniart_v3_filtered.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "req = requests.get(\"https://api.vam.ac.uk/v2/objects/search?images_exist=true&id_category=THES48885&kw_object_type=Textile%20design&kw_object_type=Furnishing%20fabric&page_size=100\")\n",
    "object_data = req.json()\n",
    "print(object_data)\n",
    "object_info = object_data[\"info\"]\n",
    "object_records = object_data[\"records\"]\n",
    "record_count = object_info[\"record_count\"]\n",
    "print(f\"There are {record_count} objects that have these aspects in the record\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import urllib.request\n",
    "\n",
    "def download_image(url, file_path, file_name):\n",
    "    full_path = file_path + file_name + '.jpg'\n",
    "    urllib.request.urlretrieve(url, full_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(f\"https://api.vam.ac.uk/v2/objects/search?images_exist=true&id_category=THES48885&kw_object_type=Textile%20design&response_format=csv&page={0}&kw_object_type=Furnishing%20fabric&page_size=100\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "f\"https://framemark.vam.ac.uk/collections/{df.loc[0, '_primaryImageId']}/full/735,/0/default.jpg\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "out_path = '/Users/ludovicaschaerf/Desktop/Data/VA_textiles/'\n",
    "\n",
    "for i in tqdm(range(100)):\n",
    "    df = pd.read_csv(f\"https://api.vam.ac.uk/v2/objects/search?images_exist=true&id_category=THES48885&kw_object_type=Textile%20design&response_format=csv&kw_object_type=Furnishing%20fabric&page_size=100&page={i}\")\n",
    "        \n",
    "    df.to_csv(out_path + f'info_{i}.csv', index = False)\n",
    "            \n",
    "    for j in range(100):\n",
    "        im_url = f\"https://framemark.vam.ac.uk/collections/{df.loc[j, '_primaryImageId']}/full/735,/0/default.jpg\"\n",
    "        name = df.loc[j, 'systemNumber']\n",
    "            \n",
    "        download_image(im_url, out_path, name)\n",
    "        \n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(100):\n",
    "    df = pd.read_csv(out_path + f'info_{i}.csv')\n",
    "    \n",
    "    if i == 0:\n",
    "        total_df = df\n",
    "        print(total_df.head())\n",
    "    else:\n",
    "        total_df = pd.concat([total_df, df], axis=0)\n",
    "    \n",
    "print(total_df.shape)\n",
    "total_df.to_csv(out_path + f'complete_info.csv', index = False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "image = object_records[4]['_images']['_iiif_image_base_url'] + 'full/735,/0/default.jpg'\n",
    "image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for rec in tqdm(object_records):\n",
    "    im_url = rec['_images']['_iiif_image_base_url'] + 'full/735,/0/default.jpg'\n",
    "    name = rec['systemNumber']\n",
    "    \n",
    "    download_image(im_url, out_path, name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import undetected_chromedriver as uc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "def chromeUndetectableDriver():\n",
    "    driver = uc.Chrome()\n",
    "    return driver\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "out_path = '/Users/ludovicaschaerf/Desktop/Data/GWU_textiles'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import time\n",
    "\n",
    "from bs4 import BeautifulSoup as bs\n",
    "\n",
    "def scrape_gwm():\n",
    "    driver = chromeUndetectableDriver()\n",
    "    driver.get('https://collections-gwu.zetcom.net/en/collection/?f=withImages&f=publicDomain&ff=%7B%22classification_en_s%22%3A%5B%22Textile%22%5D%7D&om=5')\n",
    "    \n",
    "    d = {'Filename': [], 'Title': [], 'Geography': [], 'Culture': [], 'Materials': [],\n",
    "         'Collection': [], 'Accession Number': [], 'Credit Line': [], 'Date': [],\n",
    "         'Copyright': [], 'Object Type': [], 'Dimensions': [], 'Structure': [], 'Used in': []}\n",
    "\n",
    "    c = 1\n",
    "    # scroll down until full page is rendered\n",
    "    soup = None\n",
    "    time.sleep(10)\n",
    "    soup = bs(driver.page_source, 'html.parser')\n",
    "    print(soup)\n",
    "    # while True and c < 500:  # c < 100 to prevent infinite loop\n",
    "    #     c += 1\n",
    "    #     time.sleep(3)\n",
    "    #     driver.execute_script(\"window.scrollTo(0,document.body.scrollHeight)\")\n",
    "    #     ele = driver.find_element(\"xpath\", '//*[@class=\"SearchFormResults-buttonViewMore\"]')\n",
    "    #     ele.click()\n",
    "    #     time.sleep(3)\n",
    "    #     s = bs(driver.page_source, 'html.parser')\n",
    "    #     if s != soup:\n",
    "    #         soup = s\n",
    "    #         continue\n",
    "    #     else:\n",
    "    #         print('reached end of page')\n",
    "    #         break\n",
    "\n",
    "    for a in tqdm(\n",
    "            soup.findAll('div', id=re.compile('search-result-item-object-*'))):  # soup.findAll('div', 'item'): #for old version of website use this\n",
    "        try:\n",
    "            link = 'https://collections-gwu.zetcom.net/' + a.find('a')['href']  # a.find('a')['href'] #for old version of website\n",
    "        except Exception as e:\n",
    "            print('not working', e, a)\n",
    "            continue\n",
    "        \n",
    "        name = a.find('a')['href']\n",
    "        driver.get(link)\n",
    "        time.sleep(2)\n",
    "    \n",
    "        soup_ = bs(driver.page_source, 'html.parser')\n",
    "        i = soup_.find('img', {'class': 'Carousel-itemImage'})\n",
    "        print(i)\n",
    "        \n",
    "        if i is not None:\n",
    "            i =  'https://collections-gwu.zetcom.net/' + i['src']\n",
    "            print(i, name)\n",
    "            download_image(i, out_path, name)\n",
    "            d['Filename'].append(name)\n",
    "            d['Title'].append(soup_.find('h1', {'class': 'text-primary'}).text)\n",
    "            for a in soup_.findAll('div', id='CollectionDetails-Item'):\n",
    "                d[a.find('div', {'class': 'CollectionDetails-Item-Lable'}).text].append(a.find('div', {'class': 'CollectionDetails-Item-Body'}).text)\n",
    "        else:\n",
    "            print('Link was none,', soup_)\n",
    "        \n",
    "        print(d)\n",
    "\n",
    "    # pd.DataFrame(d).to_csv(output_csv, index=False)\n",
    "    # return pd.DataFrame(d)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "scrape_gwm()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python /Users/ludovicaschaerf/Desktop/Repos/ailia-models/background_removal/indexnet/indexnet.py --input /Users/ludovicaschaerf/Desktop/Data/VA_textiles/O25180.jpg --savepath /Users/ludovicaschaerf/Desktop/Data/NO_BG/no_bg_O25180.jpg -a u2net"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "art-reco_x86",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}