RE_UPLOAD-REBUILD-RESTART
analysis.ipynb
ADDED
@@ -0,0 +1,539 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IsB9l3mBIGUN"
   },
   "source": [
    "## Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from PIL import Image\n",
    "from scipy.stats import pearsonr\n",
    "from utils.get_unique_values import get_unique_values\n",
    "from utils.remove_duplicates import unzip_fn\n",
    "from utils.show_tile_images import show_tile_images\n",
    "import zipfile\n",
    "import json\n",
    "from utils.visualize_bboxes_on_image import draw_text_on_image\n",
    "import numpy as np\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "import matplotlib.pyplot as plt\n",
    "import tqdm as tqdm\n",
    "from functools import cache\n",
    "from utils.flatten import flatten"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "5l6iv7ZrIGUP"
   },
   "outputs": [],
   "source": [
    "# !GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/spaces/mckabue/document-similarity-search-using-visual-layout-features --depth=1\n",
    "\n",
    "# !wget https://huggingface.co/spaces/mckabue/document-similarity-search-using-visual-layout-features/resolve/main/data/processed/RVL-CDIP-invoice/vectors.json.zip -P ./data/processed/RVL-CDIP-invoice/\n",
    "\n",
    "\n",
    "\n",
    "# import sys\n",
    "# sys.path.insert(0, './document-similarity-search-using-visual-layout-features')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "172P8Ey8ytD9"
   },
   "outputs": [],
   "source": [
    "# import os\n",
    "# vectors_chunks = os.listdir('/content/document-similarity-search-using-visual-layout-features/data/processed/RVL-CDIP-invoice/vectors.json.zip.chunks')\n",
    "# vectors_chunks.sort(key=lambda x: int(x.split('-')[0]))\n",
    "# vectors_chunks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "ZZD9JBaWa_T_"
   },
   "outputs": [],
   "source": [
    "vectors_df = pd.read_json('./data/local-data/processed/RVL-CDIP-invoice/vectors.json.zip')\n",
    "vectors_df"
   ]
  },
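  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Quick sanity check: the correlation cells below assume `vectors_df` exposes the columns `vectors`, `weighted_vectors`, `reduced_vectors` and `reduced_weighted_vectors` (column names inferred from how those cells index the frame, not read from the file itself)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative check only: column names inferred from the correlation cells\n",
    "# below, not read from vectors.json.zip itself.\n",
    "expected_columns = ['vectors', 'weighted_vectors', 'reduced_vectors', 'reduced_weighted_vectors']\n",
    "missing_columns = [column for column in expected_columns if column not in vectors_df.columns]\n",
    "print('missing columns:', missing_columns)"
   ]
  },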
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://gemini.google.com/app/8cd4389df12d29e6\n",
    "\n",
    "# https://chat.openai.com/c/a345a9ec-9238-4089-a6c0-bb4d375148eb"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "X0n7rBnZIGUQ"
   },
   "source": [
    "### Correlation"
   ]
  },
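  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The cells below lean on `scipy.stats.pearsonr`, which returns a pair of values: the correlation coefficient and its two-sided p-value. A minimal, purely illustrative sketch on toy vectors:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative only: toy vectors standing in for two layout-feature vectors.\n",
    "toy_a = np.array([0.1, 0.4, 0.0, 0.9, 0.3])\n",
    "toy_b = np.array([0.2, 0.5, 0.1, 0.8, 0.2])\n",
    "\n",
    "toy_result = pearsonr(toy_a, toy_b)\n",
    "# First element: correlation coefficient; second element: two-sided p-value.\n",
    "print(toy_result[0], toy_result[1])"
   ]
  },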
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "unique_values = get_unique_values(start=0.17, end=1, count=10*1000)\n",
    "\n",
    "def get_stats(index: int):\n",
    "    vectors = vectors_df.loc[index, 'vectors']\n",
    "    weighted_vectors = vectors_df.loc[index, 'weighted_vectors']\n",
    "    reduced_vectors = vectors_df.loc[index, 'reduced_vectors']\n",
    "    reduced_weighted_vectors = vectors_df.loc[index, 'reduced_weighted_vectors']\n",
    "    non_zero_vectors, non_zero_uniques = unzip_fn([(vector, unique) for vector, unique in zip(vectors, unique_values) if vector > 0]) if len([i for i in vectors if i > 0]) > 0 else ([], [])\n",
    "\n",
    "    non_zero_vectors__uniques = pearsonr(non_zero_vectors, non_zero_uniques) if len(non_zero_vectors) > 0 else [0,1]\n",
    "    vectors___unique_values = pearsonr(vectors, unique_values)\n",
    "    vectors___weighted_vectors = pearsonr(vectors, weighted_vectors)\n",
    "    vectors___reduced_vectors = pearsonr(vectors, reduced_vectors)\n",
    "    vectors___reduced_weighted_vectors = pearsonr(vectors, reduced_weighted_vectors)\n",
    "    weighted_vectors___reduced_vectors = pearsonr(weighted_vectors, reduced_vectors)\n",
    "    weighted_vectors___reduced_weighted_vectors = pearsonr(weighted_vectors, reduced_weighted_vectors)\n",
    "    reduced_vectors___reduced_weighted_vectors = pearsonr(reduced_vectors, reduced_weighted_vectors)\n",
    "\n",
    "    return {\n",
    "        'non_zero_vectors__uniques': non_zero_vectors__uniques,\n",
    "        'vectors___unique_values': vectors___unique_values,\n",
    "        'vectors___weighted_vectors': vectors___weighted_vectors,\n",
    "        'vectors___reduced_vectors': vectors___reduced_vectors,\n",
    "        'vectors___reduced_weighted_vectors': vectors___reduced_weighted_vectors,\n",
    "        'weighted_vectors___reduced_vectors': weighted_vectors___reduced_vectors,\n",
    "        'weighted_vectors___reduced_weighted_vectors': weighted_vectors___reduced_weighted_vectors,\n",
    "        'reduced_vectors___reduced_weighted_vectors': reduced_vectors___reduced_weighted_vectors,\n",
    "    }\n",
    "\n",
    "from matplotlib import pyplot as plt\n",
    "from scipy.signal import convolve\n",
    "kernel = np.array([0.25, 0.5, 0.25]) # Example kernel for simple averaging\n",
    "\n",
    "def smooth_vector(vector):\n",
    "    # Perform convolution\n",
    "    smoothed_vector = convolve(vector, kernel, mode='same') / sum(kernel)\n",
    "    return smoothed_vector\n",
    "\n",
    "def get_modified_stats(image_1_index: int, image_2_index: int, vector_column: str = 'vectors', plot = False):\n",
    "    image_1_values = vectors_df.loc[image_1_index, vector_column]\n",
    "    image_2_values = vectors_df.loc[image_2_index, vector_column]\n",
    "\n",
    "    image_1_matrix = np.array(image_1_values)\n",
    "    image_2_matrix = np.array(image_2_values)\n",
    "\n",
    "    vector_1_zero_indices = image_1_matrix == 0\n",
    "    vector_2_zero_indices = image_2_matrix == 0\n",
    "\n",
    "    image_1_matrix[vector_1_zero_indices] = unique_values[vector_1_zero_indices]\n",
    "    image_2_matrix[vector_2_zero_indices] = unique_values[vector_2_zero_indices]\n",
    "\n",
    "    _old_pearsonr = pearsonr(image_1_values, image_2_values)\n",
    "    [[_old_cosine_similarity]] = cosine_similarity([image_1_values], [image_2_values])\n",
    "    _pearsonr = pearsonr(image_1_matrix, image_2_matrix)\n",
    "    [[_cosine_similarity]] = cosine_similarity([image_1_matrix], [image_2_matrix])\n",
    "\n",
    "    image_1_matrix_smooth = smooth_vector(image_1_matrix)\n",
    "    image_2_matrix_smooth = smooth_vector(image_2_matrix)\n",
    "    _pearsonr_smooth = pearsonr(image_1_matrix_smooth, image_2_matrix_smooth)\n",
    "    [[_cosine_similarity_smooth]] = cosine_similarity([image_1_matrix_smooth], [image_2_matrix_smooth])\n",
    "\n",
    "    permuted_indices = np.random.permutation(len(image_1_matrix))\n",
    "    _pearsonr_random = pearsonr(image_1_matrix[permuted_indices], image_2_matrix[permuted_indices])\n",
    "    [[_cosine_similarity_random]] = cosine_similarity([image_1_matrix[permuted_indices]], [image_2_matrix[permuted_indices]])\n",
    "\n",
    "    if plot:\n",
    "        plt.figure(figsize=(12, 6))\n",
    "        plt.plot(image_1_values, label='image_1_values', color = 'red')\n",
    "        plt.plot(image_1_matrix_smooth, label='image_1_matrix_smooth', color = 'blue')\n",
    "        # plt.plot(image_1_matrix, label='image_1_matrix', linestyle='--', color = 'blue')\n",
    "        # plt.plot(image_1_matrix_smooth, label='image_1_matrix_smooth', linestyle='--', color = \"green\")\n",
    "        plt.show()\n",
    "\n",
    "    return {\n",
    "        'old_pearsonr' : f'{round(_old_pearsonr.statistic, 4)} - {_old_pearsonr.pvalue}',\n",
    "        'old_cosine_similarity' : round(_old_cosine_similarity, 4),\n",
    "        'pearsonr' : f'{round(_pearsonr.statistic, 4)} - {_pearsonr.pvalue}',\n",
    "        'cosine_similarity' : round(_cosine_similarity, 4),\n",
    "        'pearsonr_smooth' : f'{round(_pearsonr_smooth.statistic, 4)} - {_pearsonr_smooth.pvalue}',\n",
    "        'cosine_similarity_smooth' : round(_cosine_similarity_smooth, 4),\n",
    "        'pearsonr_random' : f'{round(_pearsonr_random.statistic, 4)} - {_pearsonr_random.pvalue}',\n",
    "        'cosine_similarity_random' : round(_cosine_similarity_random, 4),\n",
    "    }\n"
   ]
  },
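  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`smooth_vector` above is a simple moving-average filter. A purely illustrative sketch of what the `[0.25, 0.5, 0.25]` kernel does to a single spike:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative only: a toy spike, not data from vectors_df.\n",
    "toy_spike = np.array([0.0, 0.0, 1.0, 0.0, 0.0])\n",
    "\n",
    "# Same operation as smooth_vector: the spike is spread over its neighbours,\n",
    "# giving [0.0, 0.25, 0.5, 0.25, 0.0].\n",
    "print(convolve(toy_spike, kernel, mode='same') / sum(kernel))"
   ]
  },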
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "get_stats(19569)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "correlation_results = []\n",
    "for i in tqdm.tqdm(range(len(correlation_results), len(vectors_df))):\n",
    "    correlation_results.append(get_stats(i))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "columns = list(correlation_results[0].keys())\n",
    "fig, axes = plt.subplots(4, 2, figsize=(12, 12))\n",
    "axes = axes.flatten()\n",
    "for i, column in enumerate(columns):\n",
    "    ax = axes[i]\n",
    "    ax.hist([j[column][0] for j in correlation_results], bins=100)\n",
    "    ax.set_title(column)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def correlation_fn(index: int):\n",
    "    vectors = vectors_df.loc[index, 'vectors']\n",
    "    weighted_vectors = vectors_df.loc[index, 'weighted_vectors']\n",
    "    reduced_vectors = vectors_df.loc[index, 'reduced_vectors']\n",
    "    reduced_weighted_vectors = vectors_df.loc[index, 'reduced_weighted_vectors']\n",
    "    return {\n",
    "        'vectors vs weighted_vectors': pearsonr(vectors, weighted_vectors),\n",
    "        'vectors vs reduced_vectors': pearsonr(vectors, reduced_vectors),\n",
    "        'vectors vs reduced_weighted_vectors': pearsonr(vectors, reduced_weighted_vectors),\n",
    "        'weighted_vectors vs reduced_vectors': pearsonr(weighted_vectors, reduced_vectors),\n",
    "        'weighted_vectors vs reduced_weighted_vectors': pearsonr(weighted_vectors, reduced_weighted_vectors),\n",
    "        'reduced_vectors vs reduced_weighted_vectors': pearsonr(reduced_vectors, reduced_weighted_vectors),\n",
    "    }\n",
    "\n",
    "correlation_results_2 = [correlation_fn(i) for i in tqdm.tqdm(range(len(vectors_df)))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "columns = list(correlation_results_2[0].keys())\n",
    "fig, axes = plt.subplots(6, 2, figsize=(24, 24))\n",
    "axes = axes.flatten()\n",
    "for i, column in enumerate(columns):\n",
    "    ax = axes[i]\n",
    "    corr = [j[column][0] for j in correlation_results_2]\n",
    "    pvalues = [j[column][1] for j in correlation_results_2]\n",
    "    # ax.hist([j[column][0] for j in correlation_results_2], bins=100)\n",
    "    ax.plot(range(0, len(corr)), corr, label='Correlation', color='blue')\n",
    "    # ax.plot(range(0, len(pvalues)), pvalues, label='pvalues', color='red')\n",
    "    ax.set_title(column)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "columns = list(correlation_results_2[0].keys())\n",
    "fig, axes = plt.subplots(3, 2, figsize=(24, 24))\n",
    "axes = axes.flatten()\n",
    "for i, column in enumerate(columns):\n",
    "    ax = axes[i]\n",
    "    corr = [j[column][0] for j in correlation_results_2]\n",
    "    pvalues = [j[column][1] for j in correlation_results_2]\n",
    "    ax.plot(range(0, len(corr)), corr, label='correlation', color='blue')\n",
    "    ax.plot(range(0, len(pvalues)), pvalues, label='p-value', color='red')\n",
    "    ax.legend(bbox_to_anchor=(1, 0.1), loc='lower right')\n",
    "    ax.set_ylabel('correlation & p-value')\n",
    "    ax.set_xlabel(f'images - {column}')\n",
    "    ax.set_title(column)\n",
    "\n",
    "fig.savefig('/Users/charleskabue/Downloads/vector-correlations.png')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<hr/>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# vector_columns = ['vectors_column', 'weighted_vectors_column', 'reduced_vectors_column', 'reduced_weighted_vectors_column']\n",
    "# similarities_json = {}\n",
    "# for vector_column in tqdm.tqdm(vector_columns):\n",
    "#     with zipfile.ZipFile(f'./data/local-data/processed/RVL-CDIP-invoice/cosine_similarity_scores/{vector_column}.json.zip', \"r\") as zip_ref:\n",
    "#         similarity_vectors_json = json.loads(zip_ref.read(zip_ref.filelist[0].filename))\n",
    "#         similarities_json[vector_column] = similarity_vectors_json\n",
    "@cache\n",
    "def get_similarities(filter, vector_column: str = 'vectors_column'):\n",
    "    with zipfile.ZipFile(f'./data/local-data/processed/RVL-CDIP-invoice/cosine_similarity_scores/{vector_column}.json.zip', \"r\") as zip_ref:\n",
    "        similarity_vectors_json = json.loads(zip_ref.read(zip_ref.filelist[0].filename))\n",
    "    results = [value for value in tqdm.tqdm(similarity_vectors_json) if (filter(value) if filter else True)]\n",
    "    results.sort(key=lambda similarity: similarity['cosine_similarity_score'], reverse=True)\n",
    "    similarity_vectors_json = None\n",
    "    return results"
   ]
  },
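  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The similarity records are assumed to carry the keys `document_image_1`, `document_image_2` and `cosine_similarity_score` (inferred from how the cells below index each match). A purely illustrative sketch of the two filters applied next, on toy records:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative only: toy records, not read from the real similarity files.\n",
    "toy_records = [\n",
    "    {'document_image_1': '10.tif', 'document_image_2': '10.tif', 'cosine_similarity_score': 0.97},\n",
    "    {'document_image_1': '10.tif', 'document_image_2': '27.tif', 'cosine_similarity_score': 0.91},\n",
    "    {'document_image_1': '3.tif', 'document_image_2': '5.tif', 'cosine_similarity_score': 0.42},\n",
    "]\n",
    "\n",
    "# Same predicates as the duplicates_matches and top_matches cells below.\n",
    "toy_duplicates = [r for r in toy_records if r['cosine_similarity_score'] < 1 and r['document_image_1'] == r['document_image_2']]\n",
    "toy_top = [r for r in toy_records if r['cosine_similarity_score'] > 0.8 and r['document_image_1'] != r['document_image_2']]\n",
    "print(len(toy_duplicates), len(toy_top))"
   ]
  },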
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "duplicates_matches = get_similarities(\n",
    "    lambda similarity: similarity['cosine_similarity_score'] < 1 and similarity['document_image_1'] == similarity['document_image_2'],\n",
    "    'reduced_weighted_vectors_column')\n",
    "\n",
    "len(duplicates_matches)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_matches = get_similarities(\n",
    "    lambda similarity: similarity['cosine_similarity_score'] > 0.8 and similarity['document_image_1'] != similarity['document_image_2'],\n",
    "    'reduced_weighted_vectors_column')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_image(filename: str):\n",
    "    return Image.open(f'./data/local-data/raw/RVL-CDIP-invoice/{filename}')\n",
    "\n",
    "def print_matches(matches, *, per_side = 1, figsize = None, statistics = True):\n",
    "    images = [\n",
    "        [\n",
    "            get_image(match['document_image_1']),\n",
    "            get_image(match['document_image_2']),\n",
    "\n",
    "        ] + ([\n",
    "            draw_text_on_image(\n",
    "                Image.new(\"RGB\", (800, 1200), 'white'),\n",
    "                [100, 100],\n",
    "                json.dumps(\n",
    "                    get_modified_stats(\n",
    "                        int(match['document_image_1'].split('.')[0]),\n",
    "                        int(match['document_image_2'].split('.')[0]),\n",
    "                        'vectors'),\n",
    "                    indent=4),\n",
    "                label_text_size=40,\n",
    "                label_fill_color='white')\n",
    "        ] if statistics else [])\n",
    "        for match\n",
    "        in matches\n",
    "    ]\n",
    "    titles = [\n",
    "        [\n",
    "            f\"{match['document_image_1']}, Similarity - {round(match['cosine_similarity_score'], 4)}\" if statistics else match['document_image_1'],\n",
    "            match['document_image_2'],\n",
    "        ] + (['More Statistics'] if statistics else [])\n",
    "        for match\n",
    "        in matches\n",
    "    ]\n",
    "    width_parts = len(images[0]) * per_side\n",
    "    tile_image = show_tile_images(\n",
    "        images = flatten(images),\n",
    "        titles = flatten(titles),\n",
    "        width_parts = width_parts,\n",
    "        figsize = figsize or (10.2 * width_parts, 30 * (len(images) / width_parts)),\n",
    "        space = 2,\n",
    "        pad = True,\n",
    "        figcolor = '#d3eddd',\n",
    "        title_color = 'white',\n",
    "        title_background_color = 'black',\n",
    "        title_font_size = 25)\n",
    "    return tile_image\n",
    "\n",
    "len([i for i in top_matches if i['cosine_similarity_score'] >= 1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print_matches(top_matches[0:28])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "index = 44\n",
    "print(top_matches[index]['document_image_1'] + ' - ' + top_matches[index]['document_image_2'])\n",
    "draw_text_on_image(\n",
    "    print_matches([top_matches[index]], figsize=(10, 7)),\n",
    "    [330, 335],\n",
    "    f\"cosine similarity - {round(top_matches[index]['cosine_similarity_score'], 4)}\",\n",
    "    label_text_size=30,\n",
    "    label_fill_color='black',\n",
    "    label_text_color='white',\n",
    "    label_rotate_angle = 90,\n",
    "    label_text_padding = 2\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(duplicates_matches[0])\n",
    "print_matches(duplicates_matches[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from main import app\n",
    "import os\n",
    "\n",
    "model_path = '../detectron2-layout-parser/model_final.pth'\n",
    "config_path = '../detectron2-layout-parser/config.yaml'\n",
    "\n",
    "examples = [f'./demo-examples/{filename}' for filename in os.listdir('./demo-examples/')]\n",
    "app(model_path=model_path, config_path=config_path, examples=examples, debug=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from PIL import Image\n",
    "import layoutparser as lp\n",
    "from utils.get_features import get_features\n",
    "\n",
    "documents = os.listdir('./data/local-data/raw/RVL-CDIP-invoice')\n",
    "# model_path = './model/trained_model/model_final.pth'\n",
    "# config_path = './model/trained_model/config.yaml'\n",
    "model_path = '../detectron2-layout-parser/model_final.pth'\n",
    "config_path = '../detectron2-layout-parser/config.yaml'\n",
    "label_map = {0: 'Caption', 1: 'Footnote', 2: 'Formula', 3: 'List-item',\n",
    "             4: 'Page-footer', 5: 'Page-header', 6: 'Picture',\n",
    "             7: 'Section-header', 8: 'Table', 9: 'Text', 10: 'Title'}\n",
    "model = lp.Detectron2LayoutModel(\n",
    "    config_path=config_path,\n",
    "    model_path=model_path,\n",
    "    label_map=label_map)\n",
    "\n",
    "for document in documents[0:1]:\n",
    "    features = get_features(\n",
    "        image=Image.open(f'./data/local-data/raw/RVL-CDIP-invoice/{document}'),\n",
    "        model=model,\n",
    "        label_names=list(label_map.values()),\n",
    "        width_parts=100,\n",
    "        height_parts=100)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<hr/>"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}