Charles Kabui committed on
Commit
22a5952
·
1 Parent(s): 79904b0

analysis.ipynb

Browse files
Files changed (1) hide show
  1. analysis.ipynb +358 -0
analysis.ipynb ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "IsB9l3mBIGUN"
7
+ },
8
+ "source": [
9
+ "## Analysis"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": null,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "%load_ext autoreload\n",
19
+ "%autoreload 2\n",
20
+ "\n",
21
+ "import pandas as pd\n",
22
+ "from PIL import Image\n",
23
+ "from scipy.stats import pearsonr\n",
24
+ "from utils.get_unique_values import get_unique_values\n",
25
+ "from utils.remove_duplicates import unzip_fn\n",
26
+ "from utils.show_tile_images import show_tile_images\n",
27
+ "import zipfile\n",
28
+ "import json\n",
29
+ "from utils.visualize_bboxes_on_image import draw_text_on_image\n",
30
+ "import numpy as np\n",
31
+ "from sklearn.metrics.pairwise import cosine_similarity"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": null,
37
+ "metadata": {
38
+ "id": "5l6iv7ZrIGUP"
39
+ },
40
+ "outputs": [],
41
+ "source": [
42
+ "# !GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/spaces/mckabue/document-similarity-search-using-visual-layout-features --depth=1\n",
43
+ "\n",
44
+ "# !wget https://huggingface.co/spaces/mckabue/document-similarity-search-using-visual-layout-features/resolve/main/data/processed/RVL-CDIP-invoice/vectors.json.zip -P ./data/processed/RVL-CDIP-invoice/\n",
45
+ "\n",
46
+ "\n",
47
+ "\n",
48
+ "# import sys\n",
49
+ "# sys.path.insert(0, './document-similarity-search-using-visual-layout-features')"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": null,
55
+ "metadata": {
56
+ "id": "172P8Ey8ytD9"
57
+ },
58
+ "outputs": [],
59
+ "source": [
60
+ "# import os\n",
61
+ "# vectors_chunks = os.listdir('/content/document-similarity-search-using-visual-layout-features/data/processed/RVL-CDIP-invoice/vectors.json.zip.chunks')\n",
62
+ "# vectors_chunks.sort(key=lambda x: int(x.split('-')[0]))\n",
63
+ "# vectors_chunks"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": null,
69
+ "metadata": {
70
+ "id": "ZZD9JBaWa_T_"
71
+ },
72
+ "outputs": [],
73
+ "source": [
74
+ "vectors_df = pd.read_json('./data/local-data/processed/RVL-CDIP-invoice/vectors.json.zip')\n",
75
+ "vectors_df"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": null,
81
+ "metadata": {},
82
+ "outputs": [],
83
+ "source": [
84
+ "# https://gemini.google.com/app/8cd4389df12d29e6\n",
85
+ "\n",
86
+ "# https://chat.openai.com/c/a345a9ec-9238-4089-a6c0-bb4d375148eb"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "markdown",
91
+ "metadata": {
92
+ "id": "X0n7rBnZIGUQ"
93
+ },
94
+ "source": [
95
+ "### Correlation"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": null,
101
+ "metadata": {},
102
+ "outputs": [],
103
+ "source": [
104
+ "unique_values = get_unique_values(start=0.17, end=1, count=10*1000)\n",
105
+ "\n",
106
+ "def get_stats(index: int):\n",
107
+ " vectors = vectors_df.loc[index, 'vectors']\n",
108
+ " weighted_vectors = vectors_df.loc[index, 'weighted_vectors']\n",
109
+ " reduced_vectors = vectors_df.loc[index, 'reduced_vectors']\n",
110
+ " reduced_weighted_vectors = vectors_df.loc[index, 'reduced_weighted_vectors']\n",
111
+ " non_zero_vectors, non_zero_uniques = unzip_fn([(vector, unique) for vector, unique in zip(vectors, unique_values) if vector > 0])\n",
112
+ "\n",
113
+ " non_zero_vectors__uniques = pearsonr(non_zero_vectors, non_zero_uniques)\n",
114
+ " vectors___unique_values = pearsonr(vectors, unique_values)\n",
115
+ " vectors___weighted_vectors = pearsonr(vectors, weighted_vectors)\n",
116
+ " vectors___reduced_vectors = pearsonr(vectors, reduced_vectors)\n",
117
+ " vectors___reduced_weighted_vectors = pearsonr(vectors, reduced_weighted_vectors)\n",
118
+ " weighted_vectors___reduced_vectors = pearsonr(weighted_vectors, reduced_vectors)\n",
119
+ " weighted_vectors___reduced_weighted_vectors = pearsonr(weighted_vectors, reduced_weighted_vectors)\n",
120
+ " reduced_vectors___reduced_weighted_vectors = pearsonr(reduced_vectors, reduced_weighted_vectors)\n",
121
+ "\n",
122
+ " return {\n",
123
+ " 'non_zero_vectors__uniques': non_zero_vectors__uniques,\n",
124
+ " 'vectors___unique_values': vectors___unique_values,\n",
125
+ " 'vectors___weighted_vectors': vectors___weighted_vectors,\n",
126
+ " 'vectors___reduced_vectors': vectors___reduced_vectors,\n",
127
+ " 'vectors___reduced_weighted_vectors': vectors___reduced_weighted_vectors,\n",
128
+ " 'weighted_vectors___reduced_vectors': weighted_vectors___reduced_vectors,\n",
129
+ " 'weighted_vectors___reduced_weighted_vectors': weighted_vectors___reduced_weighted_vectors,\n",
130
+ " 'reduced_vectors___reduced_weighted_vectors': reduced_vectors___reduced_weighted_vectors,\n",
131
+ " }\n",
132
+ "\n",
133
+ "from matplotlib import pyplot as plt\n",
134
+ "from scipy.signal import convolve\n",
135
+ "kernel = np.array([0.25, 0.5, 0.25]) # Example kernel for simple averaging\n",
136
+ "\n",
137
+ "def smooth_vector(vector):\n",
138
+ " # Perform convolution\n",
139
+ " smoothed_vector = convolve(vector, kernel, mode='same') / sum(kernel)\n",
140
+ " return smoothed_vector\n",
141
+ "\n",
142
+ "def get_modified_stats(image_1_index: int, image_2_index: int, vector_column: str = 'vectors', plot = False):\n",
143
+ " image_1_values = vectors_df.loc[image_1_index, vector_column]\n",
144
+ " image_2_values = vectors_df.loc[image_2_index, vector_column]\n",
145
+ "\n",
146
+ " image_1_matrix = np.array(image_1_values)\n",
147
+ " image_2_matrix = np.array(image_2_values)\n",
148
+ "\n",
149
+ " vector_1_zero_indices = image_1_matrix == 0\n",
150
+ " vector_2_zero_indices = image_2_matrix == 0\n",
151
+ "\n",
152
+ " image_1_matrix[vector_1_zero_indices] = unique_values[vector_1_zero_indices]\n",
153
+ " image_2_matrix[vector_2_zero_indices] = unique_values[vector_2_zero_indices]\n",
154
+ "\n",
155
+ " _old_pearsonr = pearsonr(image_1_values, image_2_values)\n",
156
+ " [[_old_cosine_similarity]] = cosine_similarity([image_1_values], [image_2_values])\n",
157
+ " _pearsonr = pearsonr(image_1_matrix, image_2_matrix)\n",
158
+ " [[_cosine_similarity]] = cosine_similarity([image_1_matrix], [image_2_matrix])\n",
159
+ "\n",
160
+ " image_1_matrix_smooth = smooth_vector(image_1_matrix)\n",
161
+ " image_2_matrix_smooth = smooth_vector(image_2_matrix)\n",
162
+ " _pearsonr_smooth = pearsonr(image_1_matrix_smooth, image_2_matrix)\n",
163
+ " [[_cosine_similarity_smooth]] = cosine_similarity([image_1_matrix_smooth], [image_2_matrix])\n",
164
+ "\n",
165
+ " permuted_indices = np.random.permutation(len(image_1_matrix))\n",
166
+ " _pearsonr_random = pearsonr(image_1_matrix[permuted_indices], image_2_matrix[permuted_indices])\n",
167
+ " [[_cosine_similarity_random]] = cosine_similarity([image_1_matrix[permuted_indices]], [image_2_matrix[permuted_indices]])\n",
168
+ "\n",
169
+ " if plot:\n",
170
+ " plt.figure(figsize=(12, 6))\n",
171
+ " plt.plot(image_1_values, label='image_1_values', color = 'red')\n",
172
+ " plt.plot(image_1_matrix_smooth, label='image_1_matrix_smooth', color = 'blue')\n",
173
+ " # plt.plot(image_1_matrix, label='image_1_matrix', linestyle='--', color = 'blue')\n",
174
+ " # plt.plot(image_1_matrix_smooth, label='image_1_matrix_smooth', linestyle='--', color = \"green\")\n",
175
+ " plt.show()\n",
176
+ "\n",
177
+ " return {\n",
178
+ " 'old_pearsonr' : f'{round(_old_pearsonr.statistic, 4)} - {_old_pearsonr.pvalue}',\n",
179
+ " 'old_cosine_similarity' : round(_old_cosine_similarity, 4),\n",
180
+ " 'pearsonr' : f'{round(_pearsonr.statistic, 4)} - {_pearsonr.pvalue}',\n",
181
+ " 'cosine_similarity' : round(_cosine_similarity, 4),\n",
182
+ " 'pearsonr_smooth' : f'{round(_pearsonr_smooth.statistic, 4)} - {_pearsonr_smooth.pvalue}',\n",
183
+ " 'cosine_similarity_smooth' : round(_cosine_similarity_smooth, 4),\n",
184
+ " 'pearsonr_random' : f'{round(_pearsonr_random.statistic, 4)} - {_pearsonr_random.pvalue}',\n",
185
+ " 'cosine_similarity_random' : round(_cosine_similarity_random, 4),\n",
186
+ " }\n"
187
+ ]
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "execution_count": null,
192
+ "metadata": {},
193
+ "outputs": [],
194
+ "source": [
195
+ "get_stats(0)"
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "code",
200
+ "execution_count": null,
201
+ "metadata": {},
202
+ "outputs": [],
203
+ "source": [
204
+ "with zipfile.ZipFile('./data/local-data/processed/RVL-CDIP-invoice/cosine_similarity_scores/vectors_column.json.zip', \"r\") as zip_ref:\n",
205
+ " similarity_vectors_json = json.loads(zip_ref.read(zip_ref.filelist[0].filename))"
206
+ ]
207
+ },
208
+ {
209
+ "cell_type": "code",
210
+ "execution_count": null,
211
+ "metadata": {},
212
+ "outputs": [],
213
+ "source": [
214
+ "top_matches = [\n",
215
+ " similarity for similarity in \n",
216
+ " similarity_vectors_json \n",
217
+ " if similarity['cosine_similarity_score'] > 0.8 and \n",
218
+ " similarity['document_image_1'] != similarity['document_image_2']]\n",
219
+ "top_matches.sort(key=lambda similarity: similarity['cosine_similarity_score'], reverse=True)"
220
+ ]
221
+ },
222
+ {
223
+ "cell_type": "code",
224
+ "execution_count": null,
225
+ "metadata": {},
226
+ "outputs": [],
227
+ "source": [
228
+ "def get_image(filename: str):\n",
229
+ " return Image.open(f'./data/local-data/raw/RVL-CDIP-invoice/{filename}')\n",
230
+ "\n",
231
+ "def print_matches(matches, two_column_count, *, start = 0):\n",
232
+ " images_range = range(start, start + two_column_count)\n",
233
+ " images = np.array(\n",
234
+ " [\n",
235
+ " [\n",
236
+ " get_image(matches[i]['document_image_1']), \n",
237
+ " get_image(matches[i]['document_image_2']),\n",
238
+ " draw_text_on_image(\n",
239
+ " Image.new(\"RGB\", (800, 1200), 'white'),\n",
240
+ " [100, 100],\n",
241
+ " json.dumps(\n",
242
+ " get_modified_stats(\n",
243
+ " int(matches[i]['document_image_1'].split('.')[0]), \n",
244
+ " int(matches[i]['document_image_2'].split('.')[0]), \n",
245
+ " 'vectors'), \n",
246
+ " indent=4),\n",
247
+ " label_text_size=40,\n",
248
+ " label_rectangle_color='white',\n",
249
+ " ),\n",
250
+ " ]\n",
251
+ " for i\n",
252
+ " in images_range\n",
253
+ " ],\n",
254
+ " dtype=\"object\").flatten().tolist()\n",
255
+ " titles = np.array(\n",
256
+ " [\n",
257
+ " [\n",
258
+ " f\"{matches[i]['document_image_1']}, Similarity - {round(matches[i]['cosine_similarity_score'], 4)}\", \n",
259
+ " matches[i]['document_image_2'],\n",
260
+ " 'More Statistics',\n",
261
+ " ]\n",
262
+ " for i\n",
263
+ " in images_range\n",
264
+ " ]).flatten().tolist()\n",
265
+ " width_parts = 3\n",
266
+ " return show_tile_images(\n",
267
+ " images,\n",
268
+ " titles = titles,\n",
269
+ " width_parts = width_parts,\n",
270
+ " figsize = (10.2 * width_parts, 12 * (len(images) / width_parts)),\n",
271
+ " space = 2,\n",
272
+ " pad = True,\n",
273
+ " figcolor = '#d3eddd',\n",
274
+ " title_color = 'black',\n",
275
+ " title_background_color = 'white',\n",
276
+ " title_font_size = 30)\n",
277
+ "\n",
278
+ "print_matches(top_matches, 2, start=0)"
279
+ ]
280
+ },
281
+ {
282
+ "cell_type": "code",
283
+ "execution_count": null,
284
+ "metadata": {},
285
+ "outputs": [],
286
+ "source": [
287
+ "almost_similar = [similarity for similarity in \n",
288
+ " similarity_vectors_json \n",
289
+ " if similarity['cosine_similarity_score'] > 0.9 and similarity['cosine_similarity_score'] < 1.0]\n",
290
+ "almost_similar.sort(key=lambda similarity: similarity['cosine_similarity_score'], reverse=True)"
291
+ ]
292
+ },
293
+ {
294
+ "cell_type": "code",
295
+ "execution_count": null,
296
+ "metadata": {},
297
+ "outputs": [],
298
+ "source": [
299
+ "print_matches(almost_similar, 5, start=0)"
300
+ ]
301
+ },
302
+ {
303
+ "cell_type": "code",
304
+ "execution_count": null,
305
+ "metadata": {},
306
+ "outputs": [],
307
+ "source": [
308
+ "from app import app\n",
309
+ "\n",
310
+ "app()"
311
+ ]
312
+ },
313
+ {
314
+ "cell_type": "code",
315
+ "execution_count": null,
316
+ "metadata": {},
317
+ "outputs": [],
318
+ "source": [
319
+ "from utils.get_RGB_image import get_RGB_image\n",
320
+ "from pdf2image import convert_from_path\n",
321
+ "\n",
322
+ "pdf = convert_from_path('./sdfes.png', 140)"
323
+ ]
324
+ },
325
+ {
326
+ "cell_type": "code",
327
+ "execution_count": null,
328
+ "metadata": {},
329
+ "outputs": [],
330
+ "source": [
331
+ "get_RGB_image(pdf[0]) "
332
+ ]
333
+ }
334
+ ],
335
+ "metadata": {
336
+ "colab": {
337
+ "provenance": []
338
+ },
339
+ "kernelspec": {
340
+ "display_name": "Python 3",
341
+ "name": "python3"
342
+ },
343
+ "language_info": {
344
+ "codemirror_mode": {
345
+ "name": "ipython",
346
+ "version": 3
347
+ },
348
+ "file_extension": ".py",
349
+ "mimetype": "text/x-python",
350
+ "name": "python",
351
+ "nbconvert_exporter": "python",
352
+ "pygments_lexer": "ipython3",
353
+ "version": "3.10.13"
354
+ }
355
+ },
356
+ "nbformat": 4,
357
+ "nbformat_minor": 0
358
+ }