{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "IsB9l3mBIGUN"
      },
      "source": [
        "## Analysis"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "%load_ext autoreload\n",
        "%autoreload 2"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "from PIL import Image\n",
        "from scipy.stats import pearsonr\n",
        "from utils.get_unique_values import get_unique_values\n",
        "from utils.remove_duplicates import unzip_fn\n",
        "from utils.show_tile_images import show_tile_images\n",
        "import zipfile\n",
        "import json\n",
        "from utils.visualize_bboxes_on_image import draw_text_on_image\n",
        "import numpy as np\n",
        "from sklearn.metrics.pairwise import cosine_similarity\n",
        "import matplotlib.pyplot as plt\n",
        "import tqdm as tqdm\n",
        "from functools import cache\n",
        "from utils.flatten import flatten"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "5l6iv7ZrIGUP"
      },
      "outputs": [],
      "source": [
        "# !GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/spaces/mckabue/document-similarity-search-using-visual-layout-features --depth=1\n",
        "\n",
        "# !wget https://huggingface.co/spaces/mckabue/document-similarity-search-using-visual-layout-features/resolve/main/data/processed/RVL-CDIP-invoice/vectors.json.zip -P ./data/processed/RVL-CDIP-invoice/\n",
        "\n",
        "\n",
        "\n",
        "# import sys\n",
        "# sys.path.insert(0, './document-similarity-search-using-visual-layout-features')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "172P8Ey8ytD9"
      },
      "outputs": [],
      "source": [
        "# import os\n",
        "# vectors_chunks = os.listdir('/content/document-similarity-search-using-visual-layout-features/data/processed/RVL-CDIP-invoice/vectors.json.zip.chunks')\n",
        "# vectors_chunks.sort(key=lambda x: int(x.split('-')[0]))\n",
        "# vectors_chunks"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ZZD9JBaWa_T_"
      },
      "outputs": [],
      "source": [
        "# Load the per-document vectors table. Later cells read its 'vectors', 'weighted_vectors',\n",
        "# 'reduced_vectors' and 'reduced_weighted_vectors' columns by row label.\n",
        "vectors_df = pd.read_json('./data/local-data/processed/RVL-CDIP-invoice/vectors.json.zip')\n",
        "vectors_df"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# https://gemini.google.com/app/8cd4389df12d29e6\n",
        "\n",
        "# https://chat.openai.com/c/a345a9ec-9238-4089-a6c0-bb4d375148eb"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "X0n7rBnZIGUQ"
      },
      "source": [
        "### Correlation"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "unique_values = get_unique_values(start=0.17, end=1, count=10*1000)\n",
        "\n",
        "def get_stats(index: int):\n",
        "    \"\"\"Compute Pearson correlations between the vector variants of one document.\n",
        "\n",
        "    Args:\n",
        "        index: Row label in `vectors_df` of the document to analyse.\n",
        "\n",
        "    Returns:\n",
        "        dict mapping '<a>___<b>' to `pearsonr(a, b)`. 'non_zero_vectors__uniques'\n",
        "        correlates only the strictly positive entries of 'vectors' with the\n",
        "        `unique_values` at the same positions; it is the placeholder `[0, 1]`\n",
        "        when fewer than two such entries exist.\n",
        "    \"\"\"\n",
        "    vectors = vectors_df.loc[index, 'vectors']\n",
        "    weighted_vectors = vectors_df.loc[index, 'weighted_vectors']\n",
        "    reduced_vectors = vectors_df.loc[index, 'reduced_vectors']\n",
        "    reduced_weighted_vectors = vectors_df.loc[index, 'reduced_weighted_vectors']\n",
        "\n",
        "    # Filter once: keep the positions where the raw vector is strictly positive.\n",
        "    non_zero_pairs = [(vector, unique) for vector, unique in zip(vectors, unique_values) if vector > 0]\n",
        "    # pearsonr needs at least two points; with fewer, fall back to the placeholder.\n",
        "    if len(non_zero_pairs) > 1:\n",
        "        non_zero_vectors, non_zero_uniques = unzip_fn(non_zero_pairs)\n",
        "        non_zero_vectors__uniques = pearsonr(non_zero_vectors, non_zero_uniques)\n",
        "    else:\n",
        "        non_zero_vectors__uniques = [0, 1]\n",
        "\n",
        "    return {\n",
        "        'non_zero_vectors__uniques': non_zero_vectors__uniques,\n",
        "        'vectors___unique_values': pearsonr(vectors, unique_values),\n",
        "        'vectors___weighted_vectors': pearsonr(vectors, weighted_vectors),\n",
        "        'vectors___reduced_vectors': pearsonr(vectors, reduced_vectors),\n",
        "        'vectors___reduced_weighted_vectors': pearsonr(vectors, reduced_weighted_vectors),\n",
        "        'weighted_vectors___reduced_vectors': pearsonr(weighted_vectors, reduced_vectors),\n",
        "        'weighted_vectors___reduced_weighted_vectors': pearsonr(weighted_vectors, reduced_weighted_vectors),\n",
        "        # Fixed: this entry previously repeated the weighted/reduced_weighted pair.\n",
        "        'reduced_vectors___reduced_weighted_vectors': pearsonr(reduced_vectors, reduced_weighted_vectors),\n",
        "    }\n",
        "\n",
        "from matplotlib import pyplot as plt\n",
        "from scipy.signal import convolve\n",
        "# Three-tap smoothing kernel: the centre sample gets twice the weight of each neighbour.\n",
        "kernel = np.array([0.25, 0.5, 0.25])\n",
        "\n",
        "def smooth_vector(vector):\n",
        "    \"\"\"Smooth `vector` by convolving it with the module-level `kernel`.\n",
        "\n",
        "    The convolution uses mode='same' and the result is divided by the\n",
        "    kernel's total weight.\n",
        "    \"\"\"\n",
        "    kernel_weight = sum(kernel)\n",
        "    return convolve(vector, kernel, mode='same') / kernel_weight\n",
        "\n",
        "def get_modified_stats(image_1_index: int, image_2_index: int, vector_column: str = 'vectors', plot = False, seed = None):\n",
        "    \"\"\"Compare two documents' vectors raw, zero-filled, smoothed and shuffled.\n",
        "\n",
        "    Args:\n",
        "        image_1_index: Row label of the first document in `vectors_df`.\n",
        "        image_2_index: Row label of the second document in `vectors_df`.\n",
        "        vector_column: Column of `vectors_df` to compare.\n",
        "        plot: When true, plot the first document's raw and smoothed values.\n",
        "        seed: Optional seed for the shuffle baseline; None keeps it unseeded.\n",
        "\n",
        "    Returns:\n",
        "        dict of rounded statistics. Each '*pearsonr*' entry is a\n",
        "        '<statistic> - <p-value>' string; each '*cosine_similarity*' entry is a number.\n",
        "        - 'old_*': the raw values.\n",
        "        - no prefix: zeros replaced by the `unique_values` entry at the same position.\n",
        "        - '*_smooth': both zero-filled vectors passed through `smooth_vector`.\n",
        "        - '*_random': zero-filled image 1 against a shuffled zero-filled image 2.\n",
        "    \"\"\"\n",
        "    image_1_values = vectors_df.loc[image_1_index, vector_column]\n",
        "    image_2_values = vectors_df.loc[image_2_index, vector_column]\n",
        "\n",
        "    # Float copies: an integer array would truncate the fractional fill values below.\n",
        "    image_1_matrix = np.array(image_1_values, dtype=float)\n",
        "    image_2_matrix = np.array(image_2_values, dtype=float)\n",
        "    # Boolean-mask indexing needs an array, whatever sequence type unique_values is.\n",
        "    fill_values = np.asarray(unique_values, dtype=float)\n",
        "\n",
        "    vector_1_zero_indices = image_1_matrix == 0\n",
        "    vector_2_zero_indices = image_2_matrix == 0\n",
        "\n",
        "    image_1_matrix[vector_1_zero_indices] = fill_values[vector_1_zero_indices]\n",
        "    image_2_matrix[vector_2_zero_indices] = fill_values[vector_2_zero_indices]\n",
        "\n",
        "    _old_pearsonr = pearsonr(image_1_values, image_2_values)\n",
        "    [[_old_cosine_similarity]] = cosine_similarity([image_1_values], [image_2_values])\n",
        "    _pearsonr = pearsonr(image_1_matrix, image_2_matrix)\n",
        "    [[_cosine_similarity]] = cosine_similarity([image_1_matrix], [image_2_matrix])\n",
        "\n",
        "    # Fixed: compare smoothed with smoothed (image 2's smoothed vector used to be ignored).\n",
        "    image_1_matrix_smooth = smooth_vector(image_1_matrix)\n",
        "    image_2_matrix_smooth = smooth_vector(image_2_matrix)\n",
        "    _pearsonr_smooth = pearsonr(image_1_matrix_smooth, image_2_matrix_smooth)\n",
        "    [[_cosine_similarity_smooth]] = cosine_similarity([image_1_matrix_smooth], [image_2_matrix_smooth])\n",
        "\n",
        "    # Fixed: shuffle only one side. Applying the same permutation to both vectors\n",
        "    # leaves Pearson and cosine unchanged, so it was not a baseline at all.\n",
        "    rng = np.random.default_rng(seed)\n",
        "    permuted_indices = rng.permutation(len(image_2_matrix))\n",
        "    image_2_matrix_shuffled = image_2_matrix[permuted_indices]\n",
        "    _pearsonr_random = pearsonr(image_1_matrix, image_2_matrix_shuffled)\n",
        "    [[_cosine_similarity_random]] = cosine_similarity([image_1_matrix], [image_2_matrix_shuffled])\n",
        "\n",
        "    if plot:\n",
        "        plt.figure(figsize=(12, 6))\n",
        "        plt.plot(image_1_values, label='image_1_values', color = 'red')\n",
        "        plt.plot(image_1_matrix_smooth, label='image_1_matrix_smooth', color = 'blue')\n",
        "        plt.legend()  # the labels above are only shown once a legend is drawn\n",
        "        plt.show()\n",
        "\n",
        "    return {\n",
        "        'old_pearsonr'              : f'{round(_old_pearsonr.statistic, 4)} - {_old_pearsonr.pvalue}',\n",
        "        'old_cosine_similarity'     : round(_old_cosine_similarity, 4),\n",
        "        'pearsonr'                  : f'{round(_pearsonr.statistic, 4)} - {_pearsonr.pvalue}',\n",
        "        'cosine_similarity'         : round(_cosine_similarity, 4),\n",
        "        'pearsonr_smooth'           : f'{round(_pearsonr_smooth.statistic, 4)} - {_pearsonr_smooth.pvalue}',\n",
        "        'cosine_similarity_smooth'  : round(_cosine_similarity_smooth, 4),\n",
        "        'pearsonr_random'           : f'{round(_pearsonr_random.statistic, 4)} - {_pearsonr_random.pvalue}',\n",
        "        'cosine_similarity_random'  : round(_cosine_similarity_random, 4),\n",
        "    }\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Spot-check: correlation stats for the single row labelled 19569.\n",
        "get_stats(19569)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Correlation stats for every row of vectors_df, in row order, with a progress bar.\n",
        "correlation_results = [get_stats(row) for row in tqdm.tqdm(range(0, len(vectors_df)))]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "columns = list(correlation_results[0].keys())\n",
        "fig, axes = plt.subplots(4, 2, figsize=(12, 12))\n",
        "axes = axes.flatten()\n",
        "# One histogram per statistic: the first element of each result, collected over all rows.\n",
        "for ax, column in zip(axes, columns):\n",
        "    first_elements = [result[column][0] for result in correlation_results]\n",
        "    ax.hist(first_elements, bins=100)\n",
        "    ax.set_title(column)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "def correlation_fn(index: int):\n",
        "    \"\"\"Pairwise Pearson correlations between the four vector columns of one row.\n",
        "\n",
        "    Args:\n",
        "        index: Row label in `vectors_df`.\n",
        "\n",
        "    Returns:\n",
        "        dict mapping '<a> vs <b>' to `pearsonr(a, b)` for each of the six column pairs.\n",
        "    \"\"\"\n",
        "    def column(name):\n",
        "        return vectors_df.loc[index, name]\n",
        "\n",
        "    vectors = column('vectors')\n",
        "    weighted_vectors = column('weighted_vectors')\n",
        "    reduced_vectors = column('reduced_vectors')\n",
        "    reduced_weighted_vectors = column('reduced_weighted_vectors')\n",
        "\n",
        "    labelled_pairs = [\n",
        "        ('vectors vs weighted_vectors', vectors, weighted_vectors),\n",
        "        ('vectors vs reduced_vectors', vectors, reduced_vectors),\n",
        "        ('vectors vs reduced_weighted_vectors', vectors, reduced_weighted_vectors),\n",
        "        ('weighted_vectors vs reduced_vectors', weighted_vectors, reduced_vectors),\n",
        "        ('weighted_vectors vs reduced_weighted_vectors', weighted_vectors, reduced_weighted_vectors),\n",
        "        ('reduced_vectors vs reduced_weighted_vectors', reduced_vectors, reduced_weighted_vectors),\n",
        "    ]\n",
        "    return {label: pearsonr(left, right) for label, left, right in labelled_pairs}\n",
        "\n",
        "correlation_results_2 = [correlation_fn(i) for i in tqdm.tqdm(range(len(vectors_df)))]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import matplotlib.pyplot as plt\n",
        "\n",
        "columns = list(correlation_results_2[0].keys())\n",
        "# Size the grid from the number of series instead of a fixed 6x2, which left half\n",
        "# of the axes blank. Each panel keeps the same 12x4 inch footprint as before.\n",
        "grid_columns = 2\n",
        "grid_rows = -(-len(columns) // grid_columns)  # ceiling division\n",
        "fig, axes = plt.subplots(grid_rows, grid_columns, figsize=(24, 4 * grid_rows), squeeze=False)\n",
        "axes = axes.flatten()\n",
        "for ax, column in zip(axes, columns):\n",
        "    # First element of each pearsonr result, one point per row of vectors_df.\n",
        "    corr = [j[column][0] for j in correlation_results_2]\n",
        "    ax.plot(range(0, len(corr)), corr, label='Correlation', color='blue')\n",
        "    ax.set_xlabel('images')\n",
        "    ax.set_ylabel('correlation')\n",
        "    ax.set_title(column)\n",
        "# Hide any grid cells that did not receive a series.\n",
        "for unused_ax in axes[len(columns):]:\n",
        "    unused_ax.set_visible(False)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import matplotlib.pyplot as plt\n",
        "\n",
        "columns = list(correlation_results_2[0].keys())\n",
        "fig, axes = plt.subplots(3, 2, figsize=(24, 24))\n",
        "axes = axes.flatten()\n",
        "# One panel per column pair: correlation (blue) and p-value (red) for every row.\n",
        "for ax, column in zip(axes, columns):\n",
        "    corr = [j[column][0] for j in correlation_results_2]\n",
        "    pvalues = [j[column][1] for j in correlation_results_2]\n",
        "    ax.plot(range(0, len(corr)), corr, label='correlation', color='blue')\n",
        "    ax.plot(range(0, len(pvalues)), pvalues, label='p-value', color='red')\n",
        "    ax.legend(bbox_to_anchor=(1, 0.1), loc='lower right')\n",
        "    ax.set_ylabel('correlation & p-value')\n",
        "    ax.set_xlabel(f'images - {column}')\n",
        "    ax.set_title(column)\n",
        "\n",
        "# Save relative to the working directory; the previous absolute path under one\n",
        "# user's home directory only existed on that machine.\n",
        "fig.savefig('./vector-correlations.png')"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "<hr/>"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# vector_columns = ['vectors_column', 'weighted_vectors_column', 'reduced_vectors_column', 'reduced_weighted_vectors_column']\n",
        "# similarities_json = {}\n",
        "# for vector_column in tqdm.tqdm(vector_columns):\n",
        "#     with  zipfile.ZipFile(f'./data/local-data/processed/RVL-CDIP-invoice/cosine_similarity_scores/{vector_column}.json.zip', \"r\") as zip_ref:\n",
        "#         similarity_vectors_json = json.loads(zip_ref.read(zip_ref.filelist[0].filename))\n",
        "#         similarities_json[vector_column] = similarity_vectors_json\n",
        "@cache\n",
        "def get_similarities(filter, vector_column: str = 'vectors_column'):\n",
        "    \"\"\"Load the similarity records stored for `vector_column`, filter and sort them.\n",
        "\n",
        "    Args:\n",
        "        filter: Predicate called with each record; records for which it returns a\n",
        "            falsy value are dropped. A falsy `filter` (e.g. None) keeps every record.\n",
        "        vector_column: Stem of the '<vector_column>.json.zip' archive to read.\n",
        "\n",
        "    Returns:\n",
        "        list of the kept records, sorted by 'cosine_similarity_score', highest first.\n",
        "\n",
        "    NOTE(review): `@cache` keys on the `filter` object, so a lambda written inline at\n",
        "    the call site is a new key on every call -- confirm the cache is still wanted.\n",
        "    \"\"\"\n",
        "    archive_path = f'./data/local-data/processed/RVL-CDIP-invoice/cosine_similarity_scores/{vector_column}.json.zip'\n",
        "    with zipfile.ZipFile(archive_path, \"r\") as zip_ref:\n",
        "        # Only the first member of the archive is read.\n",
        "        first_member = zip_ref.filelist[0].filename\n",
        "        records = json.loads(zip_ref.read(first_member))\n",
        "    kept = [record for record in tqdm.tqdm(records) if not filter or filter(record)]\n",
        "    kept.sort(key=lambda record: record['cosine_similarity_score'], reverse=True)\n",
        "    return kept"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Records whose two image names are equal but whose similarity score is below 1.\n",
        "# NOTE(review): presumably a sanity check on self-comparisons -- confirm the intent.\n",
        "duplicates_matches = get_similarities(\n",
        "    lambda similarity: similarity['cosine_similarity_score'] < 1 and  similarity['document_image_1'] == similarity['document_image_2'], \n",
        "    'reduced_weighted_vectors_column')\n",
        "\n",
        "len(duplicates_matches)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Records pairing two differently named images with a similarity score above 0.8.\n",
        "top_matches = get_similarities(\n",
        "    lambda similarity: similarity['cosine_similarity_score'] > 0.8 and  similarity['document_image_1'] != similarity['document_image_2'], \n",
        "    'reduced_weighted_vectors_column')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "def get_image(filename: str):\n",
        "    \"\"\"Open a raw RVL-CDIP-invoice document image by file name.\n",
        "\n",
        "    The pixel data is read eagerly so the underlying file is closed before\n",
        "    returning, instead of staying open for as long as the image object lives.\n",
        "    \"\"\"\n",
        "    with Image.open(f'./data/local-data/raw/RVL-CDIP-invoice/{filename}') as image:\n",
        "        image.load()\n",
        "    return image\n",
        "\n",
        "def print_matches(matches, *, per_side = 1, figsize = None, startistics = True):\n",
        "    \"\"\"Render matched document pairs as one tiled image via `show_tile_images`.\n",
        "\n",
        "    Args:\n",
        "        matches: Non-empty sequence of records with 'document_image_1' and\n",
        "            'document_image_2' keys; 'cosine_similarity_score' is read only when\n",
        "            `startistics` is true.\n",
        "        per_side: Multiplier applied to the tiles-per-match count to obtain the\n",
        "            `width_parts` handed to `show_tile_images`.\n",
        "        figsize: Figure size forwarded to `show_tile_images`; a falsy value selects\n",
        "            a size derived from the tile counts.\n",
        "        startistics: (sic) When true, add a third tile per match holding the JSON\n",
        "            dump of `get_modified_stats` for the pair, and put the similarity score\n",
        "            in the first tile's title.\n",
        "\n",
        "    Returns:\n",
        "        Whatever `show_tile_images` returns.\n",
        "    \"\"\"\n",
        "    def stats_tile(match):\n",
        "        # Blank canvas with the pair's statistics written onto it as text.\n",
        "        canvas = Image.new(\"RGB\", (800, 1200), 'white')\n",
        "        # The text before the first '.' of each file name is used as the row label.\n",
        "        first_row = int(match['document_image_1'].split('.')[0])\n",
        "        second_row = int(match['document_image_2'].split('.')[0])\n",
        "        stats_text = json.dumps(get_modified_stats(first_row, second_row, 'vectors'), indent=4)\n",
        "        return draw_text_on_image(\n",
        "            canvas,\n",
        "            [100, 100],\n",
        "            stats_text,\n",
        "            label_text_size=40,\n",
        "            label_fill_color='white')\n",
        "\n",
        "    images = []\n",
        "    for match in matches:\n",
        "        tiles = [get_image(match['document_image_1']), get_image(match['document_image_2'])]\n",
        "        if startistics:\n",
        "            tiles.append(stats_tile(match))\n",
        "        images.append(tiles)\n",
        "\n",
        "    titles = []\n",
        "    for match in matches:\n",
        "        if startistics:\n",
        "            first_title = f\"{match['document_image_1']}, Similarity - {round(match['cosine_similarity_score'], 4)}\"\n",
        "            titles.append([first_title, match['document_image_2'], 'More Statistics'])\n",
        "        else:\n",
        "            titles.append([match['document_image_1'], match['document_image_2']])\n",
        "\n",
        "    width_parts = len(images[0]) * per_side\n",
        "    figsize = figsize or (10.2 * width_parts, 30 * (len(images) / width_parts))\n",
        "    return show_tile_images(\n",
        "        images = flatten(images),\n",
        "        titles = flatten(titles),\n",
        "        width_parts = width_parts,\n",
        "        figsize = figsize,\n",
        "        space = 2,\n",
        "        pad = True,\n",
        "        figcolor = '#d3eddd',\n",
        "        title_color = 'white',\n",
        "        title_background_color = 'black',\n",
        "        title_font_size = 25)\n",
        "\n",
        "# Number of top matches whose similarity score is at least 1.\n",
        "sum(1 for match in top_matches if match['cosine_similarity_score'] >= 1)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Tile the first 28 top matches (get_similarities returns them highest score first).\n",
        "print_matches(top_matches[0:28])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "index = 44\n",
        "selected_match = top_matches[index]\n",
        "print(selected_match['document_image_1'] + ' - ' + selected_match['document_image_2'])\n",
        "# Render the pair without the statistics tile's defaults changed, then write the\n",
        "# cosine similarity onto the resulting tile image.\n",
        "pair_tile = print_matches([selected_match], figsize=(10, 7))\n",
        "draw_text_on_image(\n",
        "    pair_tile,\n",
        "    [330, 335],\n",
        "    f\"cosine similarity - {round(selected_match['cosine_similarity_score'], 4)}\",\n",
        "    label_text_size=30,\n",
        "    label_fill_color='black',\n",
        "    label_text_color='white',\n",
        "    label_rotate_angle = 90,\n",
        "    label_text_padding = 2\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Show the highest-scoring record in duplicates_matches, then tile the first ten.\n",
        "# Indexing [0] requires duplicates_matches to be non-empty.\n",
        "print(duplicates_matches[0])\n",
        "print_matches(duplicates_matches[:10])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "from main import app\n",
        "import os\n",
        "\n",
        "# Model weights and config, resolved relative to the notebook's working directory.\n",
        "model_path = '../detectron2-layout-parser/model_final.pth'\n",
        "config_path = '../detectron2-layout-parser/config.yaml'\n",
        "\n",
        "# Every entry of ./demo-examples/ is passed to the app as an example.\n",
        "examples = [f'./demo-examples/{filename}' for filename in os.listdir('./demo-examples/')]\n",
        "app(model_path=model_path, config_path=config_path, examples=examples, debug=True)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import os\n",
        "from PIL import Image\n",
        "import layoutparser as lp\n",
        "from utils.get_features import get_features\n",
        "\n",
        "# sorted(): os.listdir returns names in arbitrary order, which made the\n",
        "# `documents[0:1]` slice below pick an arbitrary file from run to run.\n",
        "documents = sorted(os.listdir('./data/local-data/raw/RVL-CDIP-invoice'))\n",
        "# model_path = './model/trained_model/model_final.pth'\n",
        "# config_path = './model/trained_model/config.yaml'\n",
        "model_path = '../detectron2-layout-parser/model_final.pth'\n",
        "config_path = '../detectron2-layout-parser/config.yaml'\n",
        "# Class id -> label name handed to the layout model.\n",
        "label_map = {0: 'Caption', 1: 'Footnote', 2: 'Formula', 3: 'List-item', \n",
        "             4: 'Page-footer', 5: 'Page-header', 6: 'Picture', \n",
        "             7: 'Section-header', 8: 'Table', 9: 'Text', 10: 'Title'}\n",
        "model = lp.Detectron2LayoutModel(\n",
        "    config_path=config_path,\n",
        "    model_path=model_path,\n",
        "    label_map=label_map)\n",
        "\n",
        "# Extract features for the first document only.\n",
        "for document in documents[0:1]:\n",
        "    # The context manager closes the image file once the features are computed.\n",
        "    with Image.open(f'./data/local-data/raw/RVL-CDIP-invoice/{document}') as image:\n",
        "        features = get_features(\n",
        "            image=image,\n",
        "            model=model,\n",
        "            label_names=list(label_map.values()),\n",
        "            width_parts=100,\n",
        "            height_parts=100)"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.13"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}