mckabue committed on
Commit
f477039
·
verified ·
1 Parent(s): 9082298

RE_UPLOAD-REBUILD-RESTART

Browse files
Files changed (1) hide show
  1. analysis.ipynb +539 -0
analysis.ipynb ADDED
@@ -0,0 +1,539 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "IsB9l3mBIGUN"
7
+ },
8
+ "source": [
9
+ "## Analysis"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": null,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "%load_ext autoreload\n",
19
+ "%autoreload 2"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": null,
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "import pandas as pd\n",
29
+ "from PIL import Image\n",
30
+ "from scipy.stats import pearsonr\n",
31
+ "from utils.get_unique_values import get_unique_values\n",
32
+ "from utils.remove_duplicates import unzip_fn\n",
33
+ "from utils.show_tile_images import show_tile_images\n",
34
+ "import zipfile\n",
35
+ "import json\n",
36
+ "from utils.visualize_bboxes_on_image import draw_text_on_image\n",
37
+ "import numpy as np\n",
38
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
39
+ "import matplotlib.pyplot as plt\n",
40
+ "import tqdm as tqdm\n",
41
+ "from functools import cache\n",
42
+ "from utils.flatten import flatten"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "code",
47
+ "execution_count": null,
48
+ "metadata": {
49
+ "id": "5l6iv7ZrIGUP"
50
+ },
51
+ "outputs": [],
52
+ "source": [
53
+ "# !GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/spaces/mckabue/document-similarity-search-using-visual-layout-features --depth=1\n",
54
+ "\n",
55
+ "# !wget https://huggingface.co/spaces/mckabue/document-similarity-search-using-visual-layout-features/resolve/main/data/processed/RVL-CDIP-invoice/vectors.json.zip -P ./data/processed/RVL-CDIP-invoice/\n",
56
+ "\n",
57
+ "\n",
58
+ "\n",
59
+ "# import sys\n",
60
+ "# sys.path.insert(0, './document-similarity-search-using-visual-layout-features')"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "execution_count": null,
66
+ "metadata": {
67
+ "id": "172P8Ey8ytD9"
68
+ },
69
+ "outputs": [],
70
+ "source": [
71
+ "# import os\n",
72
+ "# vectors_chunks = os.listdir('/content/document-similarity-search-using-visual-layout-features/data/processed/RVL-CDIP-invoice/vectors.json.zip.chunks')\n",
73
+ "# vectors_chunks.sort(key=lambda x: int(x.split('-')[0]))\n",
74
+ "# vectors_chunks"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": null,
80
+ "metadata": {
81
+ "id": "ZZD9JBaWa_T_"
82
+ },
83
+ "outputs": [],
84
+ "source": [
85
+ "vectors_df = pd.read_json('./data/local-data/processed/RVL-CDIP-invoice/vectors.json.zip')\n",
86
+ "vectors_df"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": null,
92
+ "metadata": {},
93
+ "outputs": [],
94
+ "source": [
95
+ "# https://gemini.google.com/app/8cd4389df12d29e6\n",
96
+ "\n",
97
+ "# https://chat.openai.com/c/a345a9ec-9238-4089-a6c0-bb4d375148eb"
98
+ ]
99
+ },
100
+ {
101
+ "cell_type": "markdown",
102
+ "metadata": {
103
+ "id": "X0n7rBnZIGUQ"
104
+ },
105
+ "source": [
106
+ "### Correlation"
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "execution_count": null,
112
+ "metadata": {},
113
+ "outputs": [],
114
+ "source": [
115
+ "unique_values = get_unique_values(start=0.17, end=1, count=10*1000)\n",
116
+ "\n",
117
+ "def get_stats(index: int):\n",
118
+ " vectors = vectors_df.loc[index, 'vectors']\n",
119
+ " weighted_vectors = vectors_df.loc[index, 'weighted_vectors']\n",
120
+ " reduced_vectors = vectors_df.loc[index, 'reduced_vectors']\n",
121
+ " reduced_weighted_vectors = vectors_df.loc[index, 'reduced_weighted_vectors']\n",
122
+ " non_zero_vectors, non_zero_uniques = unzip_fn([(vector, unique) for vector, unique in zip(vectors, unique_values) if vector > 0]) if len([i for i in vectors if i > 0]) > 0 else ([], [])\n",
123
+ "\n",
124
+ " non_zero_vectors__uniques = pearsonr(non_zero_vectors, non_zero_uniques) if len(non_zero_vectors) > 0 else [0,1]\n",
125
+ " vectors___unique_values = pearsonr(vectors, unique_values)\n",
126
+ " vectors___weighted_vectors = pearsonr(vectors, weighted_vectors)\n",
127
+ " vectors___reduced_vectors = pearsonr(vectors, reduced_vectors)\n",
128
+ " vectors___reduced_weighted_vectors = pearsonr(vectors, reduced_weighted_vectors)\n",
129
+ " weighted_vectors___reduced_vectors = pearsonr(weighted_vectors, reduced_vectors)\n",
130
+ " weighted_vectors___reduced_weighted_vectors = pearsonr(weighted_vectors, reduced_weighted_vectors)\n",
131
+ " reduced_vectors___reduced_weighted_vectors = pearsonr(reduced_vectors, reduced_weighted_vectors)\n",
132
+ "\n",
133
+ " return {\n",
134
+ " 'non_zero_vectors__uniques': non_zero_vectors__uniques,\n",
135
+ " 'vectors___unique_values': vectors___unique_values,\n",
136
+ " 'vectors___weighted_vectors': vectors___weighted_vectors,\n",
137
+ " 'vectors___reduced_vectors': vectors___reduced_vectors,\n",
138
+ " 'vectors___reduced_weighted_vectors': vectors___reduced_weighted_vectors,\n",
139
+ " 'weighted_vectors___reduced_vectors': weighted_vectors___reduced_vectors,\n",
140
+ " 'weighted_vectors___reduced_weighted_vectors': weighted_vectors___reduced_weighted_vectors,\n",
141
+ " 'reduced_vectors___reduced_weighted_vectors': reduced_vectors___reduced_weighted_vectors,\n",
142
+ " }\n",
143
+ "\n",
144
+ "from matplotlib import pyplot as plt\n",
145
+ "from scipy.signal import convolve\n",
146
+ "kernel = np.array([0.25, 0.5, 0.25]) # Example kernel: 3-point centre-weighted moving average\n",
147
+ "\n",
148
+ "def smooth_vector(vector):\n",
149
+ " # Perform convolution\n",
150
+ " smoothed_vector = convolve(vector, kernel, mode='same') / sum(kernel)\n",
151
+ " return smoothed_vector\n",
152
+ "\n",
153
+ "def get_modified_stats(image_1_index: int, image_2_index: int, vector_column: str = 'vectors', plot = False):\n",
154
+ " image_1_values = vectors_df.loc[image_1_index, vector_column]\n",
155
+ " image_2_values = vectors_df.loc[image_2_index, vector_column]\n",
156
+ "\n",
157
+ " image_1_matrix = np.array(image_1_values)\n",
158
+ " image_2_matrix = np.array(image_2_values)\n",
159
+ "\n",
160
+ " vector_1_zero_indices = image_1_matrix == 0\n",
161
+ " vector_2_zero_indices = image_2_matrix == 0\n",
162
+ "\n",
163
+ " image_1_matrix[vector_1_zero_indices] = unique_values[vector_1_zero_indices]\n",
164
+ " image_2_matrix[vector_2_zero_indices] = unique_values[vector_2_zero_indices]\n",
165
+ "\n",
166
+ " _old_pearsonr = pearsonr(image_1_values, image_2_values)\n",
167
+ " [[_old_cosine_similarity]] = cosine_similarity([image_1_values], [image_2_values])\n",
168
+ " _pearsonr = pearsonr(image_1_matrix, image_2_matrix)\n",
169
+ " [[_cosine_similarity]] = cosine_similarity([image_1_matrix], [image_2_matrix])\n",
170
+ "\n",
171
+ " image_1_matrix_smooth = smooth_vector(image_1_matrix)\n",
172
+ " image_2_matrix_smooth = smooth_vector(image_2_matrix)\n",
173
+ " _pearsonr_smooth = pearsonr(image_1_matrix_smooth, image_2_matrix_smooth)\n",
174
+ " [[_cosine_similarity_smooth]] = cosine_similarity([image_1_matrix_smooth], [image_2_matrix_smooth])\n",
175
+ "\n",
176
+ " permuted_indices = np.random.permutation(len(image_1_matrix))\n",
177
+ " _pearsonr_random = pearsonr(image_1_matrix[permuted_indices], image_2_matrix[permuted_indices])\n",
178
+ " [[_cosine_similarity_random]] = cosine_similarity([image_1_matrix[permuted_indices]], [image_2_matrix[permuted_indices]])\n",
179
+ "\n",
180
+ " if plot:\n",
181
+ " plt.figure(figsize=(12, 6))\n",
182
+ " plt.plot(image_1_values, label='image_1_values', color = 'red')\n",
183
+ " plt.plot(image_1_matrix_smooth, label='image_1_matrix_smooth', color = 'blue')\n",
184
+ " # plt.plot(image_1_matrix, label='image_1_matrix', linestyle='--', color = 'blue')\n",
185
+ " # plt.plot(image_1_matrix_smooth, label='image_1_matrix_smooth', linestyle='--', color = \"green\")\n",
186
+ " plt.show()\n",
187
+ "\n",
188
+ " return {\n",
189
+ " 'old_pearsonr' : f'{round(_old_pearsonr.statistic, 4)} - {_old_pearsonr.pvalue}',\n",
190
+ " 'old_cosine_similarity' : round(_old_cosine_similarity, 4),\n",
191
+ " 'pearsonr' : f'{round(_pearsonr.statistic, 4)} - {_pearsonr.pvalue}',\n",
192
+ " 'cosine_similarity' : round(_cosine_similarity, 4),\n",
193
+ " 'pearsonr_smooth' : f'{round(_pearsonr_smooth.statistic, 4)} - {_pearsonr_smooth.pvalue}',\n",
194
+ " 'cosine_similarity_smooth' : round(_cosine_similarity_smooth, 4),\n",
195
+ " 'pearsonr_random' : f'{round(_pearsonr_random.statistic, 4)} - {_pearsonr_random.pvalue}',\n",
196
+ " 'cosine_similarity_random' : round(_cosine_similarity_random, 4),\n",
197
+ " }\n"
198
+ ]
199
+ },
200
+ {
201
+ "cell_type": "code",
202
+ "execution_count": null,
203
+ "metadata": {},
204
+ "outputs": [],
205
+ "source": [
206
+ "get_stats(19569)"
207
+ ]
208
+ },
209
+ {
210
+ "cell_type": "code",
211
+ "execution_count": null,
212
+ "metadata": {},
213
+ "outputs": [],
214
+ "source": [
215
+ "correlation_results = []\n",
216
+ "for i in tqdm.tqdm(range(len(correlation_results), len(vectors_df))):\n",
217
+ " correlation_results.append(get_stats(i))"
218
+ ]
219
+ },
220
+ {
221
+ "cell_type": "code",
222
+ "execution_count": null,
223
+ "metadata": {},
224
+ "outputs": [],
225
+ "source": [
226
+ "columns = list(correlation_results[0].keys())\n",
227
+ "fig, axes = plt.subplots(4, 2, figsize=(12, 12))\n",
228
+ "axes = axes.flatten()\n",
229
+ "for i, column in enumerate(columns):\n",
230
+ " ax = axes[i]\n",
231
+ " ax.hist([j[column][0] for j in correlation_results], bins=100)\n",
232
+ " ax.set_title(column)"
233
+ ]
234
+ },
235
+ {
236
+ "cell_type": "code",
237
+ "execution_count": null,
238
+ "metadata": {},
239
+ "outputs": [],
240
+ "source": [
241
+ "def correlation_fn(index: int):\n",
242
+ " vectors = vectors_df.loc[index, 'vectors']\n",
243
+ " weighted_vectors = vectors_df.loc[index, 'weighted_vectors']\n",
244
+ " reduced_vectors = vectors_df.loc[index, 'reduced_vectors']\n",
245
+ " reduced_weighted_vectors = vectors_df.loc[index, 'reduced_weighted_vectors']\n",
246
+ " return {\n",
247
+ " 'vectors vs weighted_vectors': pearsonr(vectors, weighted_vectors),\n",
248
+ " 'vectors vs reduced_vectors': pearsonr(vectors, reduced_vectors),\n",
249
+ " 'vectors vs reduced_weighted_vectors': pearsonr(vectors, reduced_weighted_vectors),\n",
250
+ " 'weighted_vectors vs reduced_vectors': pearsonr(weighted_vectors, reduced_vectors),\n",
251
+ " 'weighted_vectors vs reduced_weighted_vectors': pearsonr(weighted_vectors, reduced_weighted_vectors),\n",
252
+ " 'reduced_vectors vs reduced_weighted_vectors': pearsonr(reduced_vectors, reduced_weighted_vectors),\n",
253
+ " }\n",
254
+ "\n",
255
+ "correlation_results_2 = [correlation_fn(i) for i in tqdm.tqdm(range(len(vectors_df)))]"
256
+ ]
257
+ },
258
+ {
259
+ "cell_type": "code",
260
+ "execution_count": null,
261
+ "metadata": {},
262
+ "outputs": [],
263
+ "source": [
264
+ "import matplotlib.pyplot as plt\n",
265
+ "\n",
266
+ "columns = list(correlation_results_2[0].keys())\n",
267
+ "fig, axes = plt.subplots(6, 2, figsize=(24, 24))\n",
268
+ "axes = axes.flatten()\n",
269
+ "for i, column in enumerate(columns):\n",
270
+ " ax = axes[i]\n",
271
+ " corr = [j[column][0] for j in correlation_results_2]\n",
272
+ " pvalues = [j[column][1] for j in correlation_results_2]\n",
273
+ " # ax.hist([j[column][0] for j in correlation_results_2], bins=100)\n",
274
+ " ax.plot(range(0, len(corr)), corr, label='Correlation', color='blue')\n",
275
+ " # ax.plot(range(0, len(pvalues)), pvalues, label='pvalues', color='red')\n",
276
+ " ax.set_title(column)"
277
+ ]
278
+ },
279
+ {
280
+ "cell_type": "code",
281
+ "execution_count": null,
282
+ "metadata": {},
283
+ "outputs": [],
284
+ "source": [
285
+ "import matplotlib.pyplot as plt\n",
286
+ "\n",
287
+ "columns = list(correlation_results_2[0].keys())\n",
288
+ "fig, axes = plt.subplots(3, 2, figsize=(24, 24))\n",
289
+ "axes = axes.flatten()\n",
290
+ "for i, column in enumerate(columns):\n",
291
+ " ax = axes[i]\n",
292
+ " corr = [j[column][0] for j in correlation_results_2]\n",
293
+ " pvalues = [j[column][1] for j in correlation_results_2]\n",
294
+ " ax.plot(range(0, len(corr)), corr, label='correlation', color='blue')\n",
295
+ " ax.plot(range(0, len(pvalues)), pvalues, label='p-value', color='red')\n",
296
+ " ax.legend(bbox_to_anchor=(1, 0.1), loc='lower right')\n",
297
+ " ax.set_ylabel('correlation & p-value')\n",
298
+ " ax.set_xlabel(f'images - {column}')\n",
299
+ " ax.set_title(column)\n",
300
+ "\n",
301
+ "fig.savefig('/Users/charleskabue/Downloads/vector-correlations.png')"
302
+ ]
303
+ },
304
+ {
305
+ "cell_type": "markdown",
306
+ "metadata": {},
307
+ "source": [
308
+ "<hr/>"
309
+ ]
310
+ },
311
+ {
312
+ "cell_type": "code",
313
+ "execution_count": null,
314
+ "metadata": {},
315
+ "outputs": [],
316
+ "source": [
317
+ "# vector_columns = ['vectors_column', 'weighted_vectors_column', 'reduced_vectors_column', 'reduced_weighted_vectors_column']\n",
318
+ "# similarities_json = {}\n",
319
+ "# for vector_column in tqdm.tqdm(vector_columns):\n",
320
+ "# with zipfile.ZipFile(f'./data/local-data/processed/RVL-CDIP-invoice/cosine_similarity_scores/{vector_column}.json.zip', \"r\") as zip_ref:\n",
321
+ "# similarity_vectors_json = json.loads(zip_ref.read(zip_ref.filelist[0].filename))\n",
322
+ "# similarities_json[vector_column] = similarity_vectors_json\n",
323
+ "@cache\n",
324
+ "def get_similarities(filter, vector_column: str = 'vectors_column'):\n",
325
+ " with zipfile.ZipFile(f'./data/local-data/processed/RVL-CDIP-invoice/cosine_similarity_scores/{vector_column}.json.zip', \"r\") as zip_ref:\n",
326
+ " similarity_vectors_json = json.loads(zip_ref.read(zip_ref.filelist[0].filename))\n",
327
+ " results = [value for value in tqdm.tqdm(similarity_vectors_json) if (filter(value) if filter else True)]\n",
328
+ " results.sort(key=lambda similarity: similarity['cosine_similarity_score'], reverse=True)\n",
329
+ " similarity_vectors_json = None\n",
330
+ " return results"
331
+ ]
332
+ },
333
+ {
334
+ "cell_type": "code",
335
+ "execution_count": null,
336
+ "metadata": {},
337
+ "outputs": [],
338
+ "source": []
339
+ },
340
+ {
341
+ "cell_type": "code",
342
+ "execution_count": null,
343
+ "metadata": {},
344
+ "outputs": [],
345
+ "source": [
346
+ "duplicates_matches = get_similarities(\n",
347
+ " lambda similarity: similarity['cosine_similarity_score'] < 1 and similarity['document_image_1'] == similarity['document_image_2'], \n",
348
+ " 'reduced_weighted_vectors_column')\n",
349
+ "\n",
350
+ "len(duplicates_matches)"
351
+ ]
352
+ },
353
+ {
354
+ "cell_type": "code",
355
+ "execution_count": null,
356
+ "metadata": {},
357
+ "outputs": [],
358
+ "source": [
359
+ "top_matches = get_similarities(\n",
360
+ " lambda similarity: similarity['cosine_similarity_score'] > 0.8 and similarity['document_image_1'] != similarity['document_image_2'], \n",
361
+ " 'reduced_weighted_vectors_column')"
362
+ ]
363
+ },
364
+ {
365
+ "cell_type": "code",
366
+ "execution_count": null,
367
+ "metadata": {},
368
+ "outputs": [],
369
+ "source": [
370
+ "def get_image(filename: str):\n",
371
+ " return Image.open(f'./data/local-data/raw/RVL-CDIP-invoice/{filename}')\n",
372
+ "\n",
373
+ "def print_matches(matches, *, per_side = 1, figsize = None, startistics = True):\n",
374
+ " images = [\n",
375
+ " [\n",
376
+ " get_image(match['document_image_1']), \n",
377
+ " get_image(match['document_image_2']),\n",
378
+ " \n",
379
+ " ] + ([\n",
380
+ " draw_text_on_image(\n",
381
+ " Image.new(\"RGB\", (800, 1200), 'white'),\n",
382
+ " [100, 100],\n",
383
+ " json.dumps(\n",
384
+ " get_modified_stats(\n",
385
+ " int(match['document_image_1'].split('.')[0]), \n",
386
+ " int(match['document_image_2'].split('.')[0]), \n",
387
+ " 'vectors'), \n",
388
+ " indent=4),\n",
389
+ " label_text_size=40,\n",
390
+ " label_fill_color='white')\n",
391
+ " ] if startistics else [])\n",
392
+ " for match\n",
393
+ " in matches\n",
394
+ " ]\n",
395
+ " titles = [\n",
396
+ " [\n",
397
+ " f\"{match['document_image_1']}, Similarity - {round(match['cosine_similarity_score'], 4)}\" if startistics else match['document_image_1'],\n",
398
+ " match['document_image_2'],\n",
399
+ " ] + (['More Statistics'] if startistics else [])\n",
400
+ " for match\n",
401
+ " in matches\n",
402
+ " ]\n",
403
+ " width_parts = len(images[0]) * per_side\n",
404
+ " tile_image = show_tile_images(\n",
405
+ " images = flatten(images),\n",
406
+ " titles = flatten(titles),\n",
407
+ " width_parts = width_parts,\n",
408
+ " figsize = figsize or (10.2 * width_parts, 30 * (len(images) / width_parts)),\n",
409
+ " space = 2,\n",
410
+ " pad = True,\n",
411
+ " figcolor = '#d3eddd',\n",
412
+ " title_color = 'white',\n",
413
+ " title_background_color = 'black',\n",
414
+ " title_font_size = 25)\n",
415
+ " return tile_image\n",
416
+ "\n",
417
+ "len([i for i in top_matches if i['cosine_similarity_score'] >= 1])"
418
+ ]
419
+ },
420
+ {
421
+ "cell_type": "code",
422
+ "execution_count": null,
423
+ "metadata": {},
424
+ "outputs": [],
425
+ "source": [
426
+ "print_matches(top_matches[0:28])"
427
+ ]
428
+ },
429
+ {
430
+ "cell_type": "code",
431
+ "execution_count": null,
432
+ "metadata": {},
433
+ "outputs": [],
434
+ "source": [
435
+ "index = 44\n",
436
+ "print(top_matches[index]['document_image_1'] + ' - ' + top_matches[index]['document_image_2'])\n",
437
+ "draw_text_on_image(\n",
438
+ " print_matches([top_matches[index]], figsize=(10, 7)),\n",
439
+ " [330, 335],\n",
440
+ " f\"cosine similarity - {round(top_matches[index]['cosine_similarity_score'], 4)}\",\n",
441
+ " label_text_size=30,\n",
442
+ " label_fill_color='black',\n",
443
+ " label_text_color='white',\n",
444
+ " label_rotate_angle = 90,\n",
445
+ " label_text_padding = 2\n",
446
+ ")"
447
+ ]
448
+ },
449
+ {
450
+ "cell_type": "code",
451
+ "execution_count": null,
452
+ "metadata": {},
453
+ "outputs": [],
454
+ "source": [
455
+ "print(duplicates_matches[0])\n",
456
+ "print_matches(duplicates_matches[:10])"
457
+ ]
458
+ },
459
+ {
460
+ "cell_type": "code",
461
+ "execution_count": null,
462
+ "metadata": {},
463
+ "outputs": [],
464
+ "source": [
465
+ "from main import app\n",
466
+ "import os\n",
467
+ "\n",
468
+ "model_path = '../detectron2-layout-parser/model_final.pth'\n",
469
+ "config_path = '../detectron2-layout-parser/config.yaml'\n",
470
+ "\n",
471
+ "examples = [f'./demo-examples/{filename}' for filename in os.listdir('./demo-examples/')]\n",
472
+ "app(model_path=model_path, config_path=config_path, examples=examples, debug=True)"
473
+ ]
474
+ },
475
+ {
476
+ "cell_type": "code",
477
+ "execution_count": null,
478
+ "metadata": {},
479
+ "outputs": [],
480
+ "source": [
481
+ "import os\n",
482
+ "from PIL import Image\n",
483
+ "import layoutparser as lp\n",
484
+ "from utils.get_features import get_features\n",
485
+ "\n",
486
+ "documents = os.listdir('./data/local-data/raw/RVL-CDIP-invoice')\n",
487
+ "# model_path = './model/trained_model/model_final.pth'\n",
488
+ "# config_path = './model/trained_model/config.yaml'\n",
489
+ "model_path = '../detectron2-layout-parser/model_final.pth'\n",
490
+ "config_path = '../detectron2-layout-parser/config.yaml'\n",
491
+ "label_map = {0: 'Caption', 1: 'Footnote', 2: 'Formula', 3: 'List-item', \n",
492
+ " 4: 'Page-footer', 5: 'Page-header', 6: 'Picture', \n",
493
+ " 7: 'Section-header', 8: 'Table', 9: 'Text', 10: 'Title'}\n",
494
+ "model = lp.Detectron2LayoutModel(\n",
495
+ " config_path=config_path,\n",
496
+ " model_path=model_path,\n",
497
+ " label_map=label_map)\n",
498
+ "\n",
499
+ "for document in documents[0:1]:\n",
500
+ " features = get_features(\n",
501
+ " image=Image.open(f'./data/local-data/raw/RVL-CDIP-invoice/{document}'),\n",
502
+ " model=model,\n",
503
+ " label_names=list(label_map.values()),\n",
504
+ " width_parts=100,\n",
505
+ " height_parts=100)"
506
+ ]
507
+ },
508
+ {
509
+ "cell_type": "markdown",
510
+ "metadata": {},
511
+ "source": [
512
+ "<hr/>"
513
+ ]
514
+ }
515
+ ],
516
+ "metadata": {
517
+ "colab": {
518
+ "provenance": []
519
+ },
520
+ "kernelspec": {
521
+ "display_name": "Python 3",
522
+ "name": "python3"
523
+ },
524
+ "language_info": {
525
+ "codemirror_mode": {
526
+ "name": "ipython",
527
+ "version": 3
528
+ },
529
+ "file_extension": ".py",
530
+ "mimetype": "text/x-python",
531
+ "name": "python",
532
+ "nbconvert_exporter": "python",
533
+ "pygments_lexer": "ipython3",
534
+ "version": "3.10.13"
535
+ }
536
+ },
537
+ "nbformat": 4,
538
+ "nbformat_minor": 0
539
+ }