Spaces:

mckabue
/

document-similarity-matching-using-visual-layout-features-archive

Build error

App Files Files Community

Charles Kabui committed on Feb 27, 2024

Commit

22a5952

1 Parent(s): 79904b0

analysis.ipynb

Browse files

Files changed (1) hide show

analysis.ipynb +358 -0

analysis.ipynb ADDED Viewed

	@@ -0,0 +1,358 @@

+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "IsB9l3mBIGUN"
+      },
+      "source": [
+        "## Analysis"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "%load_ext autoreload\n",
+        "%autoreload 2\n",
+        "\n",
+        "import pandas as pd\n",
+        "from PIL import Image\n",
+        "from scipy.stats import pearsonr\n",
+        "from utils.get_unique_values import get_unique_values\n",
+        "from utils.remove_duplicates import unzip_fn\n",
+        "from utils.show_tile_images import show_tile_images\n",
+        "import zipfile\n",
+        "import json\n",
+        "from utils.visualize_bboxes_on_image import draw_text_on_image\n",
+        "import numpy as np\n",
+        "from sklearn.metrics.pairwise import cosine_similarity"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "5l6iv7ZrIGUP"
+      },
+      "outputs": [],
+      "source": [
+        "# !GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/spaces/mckabue/document-similarity-search-using-visual-layout-features --depth=1\n",
+        "\n",
+        "# !wget https://huggingface.co/spaces/mckabue/document-similarity-search-using-visual-layout-features/resolve/main/data/processed/RVL-CDIP-invoice/vectors.json.zip -P ./data/processed/RVL-CDIP-invoice/\n",
+        "\n",
+        "\n",
+        "\n",
+        "# import sys\n",
+        "# sys.path.insert(0, './document-similarity-search-using-visual-layout-features')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "172P8Ey8ytD9"
+      },
+      "outputs": [],
+      "source": [
+        "# import os\n",
+        "# vectors_chunks = os.listdir('/content/document-similarity-search-using-visual-layout-features/data/processed/RVL-CDIP-invoice/vectors.json.zip.chunks')\n",
+        "# vectors_chunks.sort(key=lambda x: int(x.split('-')[0]))\n",
+        "# vectors_chunks"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "ZZD9JBaWa_T_"
+      },
+      "outputs": [],
+      "source": [
+        "vectors_df = pd.read_json('./data/local-data/processed/RVL-CDIP-invoice/vectors.json.zip')\n",
+        "vectors_df"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# https://gemini.google.com/app/8cd4389df12d29e6\n",
+        "\n",
+        "# https://chat.openai.com/c/a345a9ec-9238-4089-a6c0-bb4d375148eb"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "X0n7rBnZIGUQ"
+      },
+      "source": [
+        "### Correlation"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "unique_values = get_unique_values(start=0.17, end=1, count=10*1000)\n",
+        "\n",
+        "def get_stats(index: int):\n",
+        "    \"\"\"Pearson correlations between the vector columns stored for one row of ``vectors_df``.\n",
+        "\n",
+        "    Args:\n",
+        "        index: Row label looked up with ``vectors_df.loc``.\n",
+        "\n",
+        "    Returns:\n",
+        "        dict mapping a '<a>___<b>' pair name to the ``pearsonr`` result for that pair.\n",
+        "        'non_zero_vectors__uniques' only uses the positions where 'vectors' is > 0,\n",
+        "        paired with the entries of ``unique_values`` at the same positions.\n",
+        "    \"\"\"\n",
+        "    vectors = vectors_df.loc[index, 'vectors']\n",
+        "    weighted_vectors = vectors_df.loc[index, 'weighted_vectors']\n",
+        "    reduced_vectors = vectors_df.loc[index, 'reduced_vectors']\n",
+        "    reduced_weighted_vectors = vectors_df.loc[index, 'reduced_weighted_vectors']\n",
+        "\n",
+        "    # Keep only the (vector, unique) pairs whose vector entry is strictly positive.\n",
+        "    # NOTE(review): presumably unzip_fn turns the list of pairs into two sequences - confirm.\n",
+        "    non_zero_vectors, non_zero_uniques = unzip_fn(\n",
+        "        [(vector, unique) for vector, unique in zip(vectors, unique_values) if vector > 0])\n",
+        "\n",
+        "    return {\n",
+        "        'non_zero_vectors__uniques': pearsonr(non_zero_vectors, non_zero_uniques),\n",
+        "        'vectors___unique_values': pearsonr(vectors, unique_values),\n",
+        "        'vectors___weighted_vectors': pearsonr(vectors, weighted_vectors),\n",
+        "        'vectors___reduced_vectors': pearsonr(vectors, reduced_vectors),\n",
+        "        'vectors___reduced_weighted_vectors': pearsonr(vectors, reduced_weighted_vectors),\n",
+        "        'weighted_vectors___reduced_vectors': pearsonr(weighted_vectors, reduced_vectors),\n",
+        "        'weighted_vectors___reduced_weighted_vectors': pearsonr(weighted_vectors, reduced_weighted_vectors),\n",
+        "        # Fixed: this entry previously correlated weighted_vectors (not reduced_vectors)\n",
+        "        # with reduced_weighted_vectors, duplicating the entry above.\n",
+        "        'reduced_vectors___reduced_weighted_vectors': pearsonr(reduced_vectors, reduced_weighted_vectors),\n",
+        "    }\n",
+        "\n",
+        "from matplotlib import pyplot as plt\n",
+        "from scipy.signal import convolve\n",
+        "kernel = np.array([0.25, 0.5, 0.25])  # Example kernel for simple averaging\n",
+        "\n",
+        "def smooth_vector(vector):\n",
+        "    \"\"\"Convolve ``vector`` with the notebook-level ``kernel`` (mode='same') and divide by the kernel's sum.\"\"\"\n",
+        "    kernel_total = sum(kernel)\n",
+        "    return convolve(vector, kernel, mode='same') / kernel_total\n",
+        "\n",
+        "def get_modified_stats(image_1_index: int, image_2_index: int, vector_column: str = 'vectors', plot = False):\n",
+        "    \"\"\"Compare two rows of ``vectors_df`` using Pearson correlation and cosine similarity.\n",
+        "\n",
+        "    Four variants are reported:\n",
+        "      * ``old_*``    - the vectors exactly as stored;\n",
+        "      * (no prefix)  - zero entries replaced by the entry of ``unique_values`` at the same position;\n",
+        "      * ``*_smooth`` - the zero-filled vectors after ``smooth_vector`` (both sides smoothed);\n",
+        "      * ``*_random`` - the zero-filled vectors with one shared random permutation applied to both.\n",
+        "\n",
+        "    Args:\n",
+        "        image_1_index: Row label of the first document in ``vectors_df``.\n",
+        "        image_2_index: Row label of the second document in ``vectors_df``.\n",
+        "        vector_column: Column of ``vectors_df`` holding the vectors to compare.\n",
+        "        plot: When true, plot the first document's raw and smoothed vectors.\n",
+        "\n",
+        "    Returns:\n",
+        "        dict with a '<statistic> - <p-value>' string per Pearson variant and a rounded\n",
+        "        float per cosine-similarity variant.\n",
+        "    \"\"\"\n",
+        "    image_1_values = vectors_df.loc[image_1_index, vector_column]\n",
+        "    image_2_values = vectors_df.loc[image_2_index, vector_column]\n",
+        "\n",
+        "    # Work on float copies so the fill values are not truncated if the stored vectors are integers.\n",
+        "    image_1_matrix = np.array(image_1_values, dtype=float)\n",
+        "    image_2_matrix = np.array(image_2_values, dtype=float)\n",
+        "    # Boolean-mask indexing needs an ndarray; a plain list would raise here.\n",
+        "    fill_values = np.asarray(unique_values)\n",
+        "\n",
+        "    vector_1_zero_indices = image_1_matrix == 0\n",
+        "    vector_2_zero_indices = image_2_matrix == 0\n",
+        "\n",
+        "    image_1_matrix[vector_1_zero_indices] = fill_values[vector_1_zero_indices]\n",
+        "    image_2_matrix[vector_2_zero_indices] = fill_values[vector_2_zero_indices]\n",
+        "\n",
+        "    _old_pearsonr = pearsonr(image_1_values, image_2_values)\n",
+        "    [[_old_cosine_similarity]] = cosine_similarity([image_1_values], [image_2_values])\n",
+        "    _pearsonr = pearsonr(image_1_matrix, image_2_matrix)\n",
+        "    [[_cosine_similarity]] = cosine_similarity([image_1_matrix], [image_2_matrix])\n",
+        "\n",
+        "    # Fixed: the smoothed second vector used to be computed and then ignored, so the\n",
+        "    # 'smooth' statistics compared a smoothed vector against an unsmoothed one.\n",
+        "    image_1_matrix_smooth = smooth_vector(image_1_matrix)\n",
+        "    image_2_matrix_smooth = smooth_vector(image_2_matrix)\n",
+        "    _pearsonr_smooth = pearsonr(image_1_matrix_smooth, image_2_matrix_smooth)\n",
+        "    [[_cosine_similarity_smooth]] = cosine_similarity([image_1_matrix_smooth], [image_2_matrix_smooth])\n",
+        "\n",
+        "    # NOTE(review): the same permutation is applied to both vectors, and both measures are\n",
+        "    # order-invariant, so these values should match the unpermuted ones up to rounding.\n",
+        "    # Permute only one side if a shuffled baseline was intended.\n",
+        "    permuted_indices = np.random.permutation(len(image_1_matrix))\n",
+        "    _pearsonr_random = pearsonr(image_1_matrix[permuted_indices], image_2_matrix[permuted_indices])\n",
+        "    [[_cosine_similarity_random]] = cosine_similarity([image_1_matrix[permuted_indices]], [image_2_matrix[permuted_indices]])\n",
+        "\n",
+        "    if plot:\n",
+        "        plt.figure(figsize=(12, 6))\n",
+        "        plt.plot(image_1_values, label='image_1_values', color = 'red')\n",
+        "        plt.plot(image_1_matrix_smooth, label='image_1_matrix_smooth', color = 'blue')\n",
+        "        plt.legend()  # the labels above are only rendered when a legend is drawn\n",
+        "        plt.show()\n",
+        "\n",
+        "    return {\n",
+        "        'old_pearsonr'          : f'{round(_old_pearsonr.statistic, 4)} - {_old_pearsonr.pvalue}',\n",
+        "        'old_cosine_similarity' : round(_old_cosine_similarity, 4),\n",
+        "        'pearsonr'              : f'{round(_pearsonr.statistic, 4)} - {_pearsonr.pvalue}',\n",
+        "        'cosine_similarity'     : round(_cosine_similarity, 4),\n",
+        "        'pearsonr_smooth'              : f'{round(_pearsonr_smooth.statistic, 4)} - {_pearsonr_smooth.pvalue}',\n",
+        "        'cosine_similarity_smooth'     : round(_cosine_similarity_smooth, 4),\n",
+        "        'pearsonr_random'              : f'{round(_pearsonr_random.statistic, 4)} - {_pearsonr_random.pvalue}',\n",
+        "        'cosine_similarity_random'     : round(_cosine_similarity_random, 4),\n",
+        "    }\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "get_stats(0)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Parse the first member of the similarity-scores archive as JSON.\n",
+        "with zipfile.ZipFile(\n",
+        "    './data/local-data/processed/RVL-CDIP-invoice/cosine_similarity_scores/vectors_column.json.zip',\n",
+        "    mode='r',\n",
+        ") as zip_ref:\n",
+        "    similarity_vectors_json = json.loads(zip_ref.read(zip_ref.namelist()[0]))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Pairs of different documents scoring above 0.8, highest score first.\n",
+        "top_matches = sorted(\n",
+        "    (\n",
+        "        similarity\n",
+        "        for similarity in similarity_vectors_json\n",
+        "        if similarity['cosine_similarity_score'] > 0.8\n",
+        "        and similarity['document_image_1'] != similarity['document_image_2']\n",
+        "    ),\n",
+        "    key=lambda similarity: similarity['cosine_similarity_score'],\n",
+        "    reverse=True,\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "def get_image(filename: str):\n",
+        "    \"\"\"Open ``filename`` from the raw RVL-CDIP-invoice image folder with PIL.\"\"\"\n",
+        "    image_path = f'./data/local-data/raw/RVL-CDIP-invoice/{filename}'\n",
+        "    return Image.open(image_path)\n",
+        "\n",
+        "def print_matches(matches, two_column_count, *, start = 0):\n",
+        "    \"\"\"Show matched document pairs as a three-column tile grid.\n",
+        "\n",
+        "    Each row holds the first document, the second document and a white panel with the\n",
+        "    JSON-formatted output of ``get_modified_stats`` for the pair.\n",
+        "\n",
+        "    Args:\n",
+        "        matches: Sequence of dicts with 'document_image_1', 'document_image_2' and\n",
+        "            'cosine_similarity_score' keys. The integer before the first '.' of each\n",
+        "            file name is used as the row label in ``vectors_df``.\n",
+        "        two_column_count: Number of pairs (rows) to show.\n",
+        "        start: Index in ``matches`` of the first pair to show.\n",
+        "\n",
+        "    Returns:\n",
+        "        Whatever ``show_tile_images`` returns.\n",
+        "\n",
+        "    Raises:\n",
+        "        IndexError: if rows were requested but ``start`` is past the end of ``matches``.\n",
+        "    \"\"\"\n",
+        "    if two_column_count > 0 and start >= len(matches):\n",
+        "        raise IndexError(f'start={start} is out of range for {len(matches)} matches')\n",
+        "    # Clamp so that asking for more rows than exist shows what is available\n",
+        "    # instead of failing part-way through with an IndexError.\n",
+        "    stop = min(start + two_column_count, len(matches))\n",
+        "\n",
+        "    # Plain lists are used on purpose: wrapping PIL images in np.array(...) lets NumPy\n",
+        "    # treat them as array-likes, so the flattened result depends on the image shapes.\n",
+        "    images = []\n",
+        "    titles = []\n",
+        "    for i in range(start, stop):\n",
+        "        match = matches[i]\n",
+        "        image_1_name = match['document_image_1']\n",
+        "        image_2_name = match['document_image_2']\n",
+        "        image_1 = get_image(image_1_name)\n",
+        "        image_2 = get_image(image_2_name)\n",
+        "        stats = get_modified_stats(\n",
+        "            int(image_1_name.split('.')[0]),\n",
+        "            int(image_2_name.split('.')[0]),\n",
+        "            'vectors')\n",
+        "        stats_panel = draw_text_on_image(\n",
+        "            Image.new(\"RGB\", (800, 1200), 'white'),\n",
+        "            [100, 100],\n",
+        "            json.dumps(stats, indent=4),\n",
+        "            label_text_size=40,\n",
+        "            label_rectangle_color='white',\n",
+        "        )\n",
+        "        images.extend([image_1, image_2, stats_panel])\n",
+        "        titles.extend([\n",
+        "            f\"{image_1_name}, Similarity - {round(match['cosine_similarity_score'], 4)}\",\n",
+        "            image_2_name,\n",
+        "            'More Statistics',\n",
+        "        ])\n",
+        "\n",
+        "    width_parts = 3\n",
+        "    return show_tile_images(\n",
+        "        images,\n",
+        "        titles = titles,\n",
+        "        width_parts = width_parts,\n",
+        "        figsize = (10.2 * width_parts, 12 * (len(images) / width_parts)),\n",
+        "        space = 2,\n",
+        "        pad = True,\n",
+        "        figcolor = '#d3eddd',\n",
+        "        title_color = 'black',\n",
+        "        title_background_color = 'white',\n",
+        "        title_font_size = 30)\n",
+        "\n",
+        "print_matches(top_matches, 2, start=0)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Pairs whose score is strictly between 0.9 and 1.0, highest score first.\n",
+        "almost_similar = sorted(\n",
+        "    (\n",
+        "        similarity\n",
+        "        for similarity in similarity_vectors_json\n",
+        "        if 0.9 < similarity['cosine_similarity_score'] < 1.0\n",
+        "    ),\n",
+        "    key=lambda similarity: similarity['cosine_similarity_score'],\n",
+        "    reverse=True,\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "print_matches(almost_similar, 5, start=0)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from app import app\n",
+        "\n",
+        "app()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from utils.get_RGB_image import get_RGB_image\n",
+        "from pdf2image import convert_from_path\n",
+        "\n",
+        "pdf = convert_from_path('./sdfes.png', 140)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "get_RGB_image(pdf[0]) "
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.10.13"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}