FoodDesert committed on
Commit
8b24305
1 Parent(s): b4bf2a9

Upload 2 files

Browse files

Trying to fix the mascot image again. Also uploading the Jupyter notebook used to construct the SVD-reduced TF-IDF matrix.

Files changed (2) hide show
  1. app.py +40 -5
  2. predict_all_tags_from_dump.ipynb +721 -0
app.py CHANGED
@@ -22,8 +22,10 @@ import glob
22
  import itertools
23
  from itertools import islice
24
  from pathlib import Path
25
-
26
 
 
 
27
 
28
 
29
  faq_content="""
@@ -153,7 +155,7 @@ def extract_tags(tree):
153
  return tags_with_positions
154
 
155
 
156
- special_tags = ["score:0", "score:1", "score:2", "score:3", "score:4", "score:5", "score:6", "score:7", "score:8", "score:9"]
157
  def remove_special_tags(original_string):
158
  tags = [tag.strip() for tag in original_string.split(",")]
159
  remaining_tags = [tag for tag in tags if tag not in special_tags]
@@ -713,9 +715,42 @@ with gr.Blocks(css=css) as app:
713
  #gr.HTML('<div style="text-align: center;"><img src={image_path} alt="Cute Mascot" style="max-height: 100px; background: transparent;"></div><br>')
714
  #gr.HTML("<br>" * 2) # Adjust the number of line breaks ("<br>") as needed to push the button down
715
  #image_path = os.path.join('mascotimages', "transparentsquirrel.png")
716
- random_image_path = os.path.join('mascotimages', random.choice([f for f in os.listdir('mascotimages') if os.path.isfile(os.path.join('mascotimages', f))]))
717
- with Image.open(random_image_path) as img:
718
- gr.Image(value=img,show_label=False, show_download_button=False, show_share_button=False, height=200)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
719
  submit_button = gr.Button(variant="primary")
720
  with gr.Row():
721
  with gr.Column(scale=3):
 
22
  import itertools
23
  from itertools import islice
24
  from pathlib import Path
25
+ import logging
26
 
27
+ # Set up logging
28
+ logging.basicConfig(filename='error.log', level=logging.DEBUG, format='%(asctime)s %(levelname)s:%(message)s')
29
 
30
 
31
  faq_content="""
 
155
  return tags_with_positions
156
 
157
 
158
+ special_tags = ["score:0", "score:1", "score:2", "score:3", "score:4", "score:5", "score:6", "score:7", "score:8", "score:9", "rating:s", "rating:q", "rating:e"]
159
  def remove_special_tags(original_string):
160
  tags = [tag.strip() for tag in original_string.split(",")]
161
  remaining_tags = [tag for tag in tags if tag not in special_tags]
 
715
  #gr.HTML('<div style="text-align: center;"><img src={image_path} alt="Cute Mascot" style="max-height: 100px; background: transparent;"></div><br>')
716
  #gr.HTML("<br>" * 2) # Adjust the number of line breaks ("<br>") as needed to push the button down
717
  #image_path = os.path.join('mascotimages', "transparentsquirrel.png")
718
+ #random_image_path = os.path.join('mascotimages', random.choice([f for f in os.listdir('mascotimages') if os.path.isfile(os.path.join('mascotimages', f))]))
719
+ #with Image.open(random_image_path) as img:
720
+ # gr.Image(value=img,show_label=False, show_download_button=False, show_share_button=False, height=200)
721
+
722
+
723
+
724
+
725
+
726
+ try:
727
+ files = [f for f in os.listdir('mascotimages') if os.path.isfile(os.path.join('mascotimages', f))]
728
+ logging.debug(f"Mascot: Files in 'mascotimages': {files}") # Log the list of files found
729
+
730
+ if files:
731
+ random_image_path = os.path.join('mascotimages', random.choice(files))
732
+ logging.info(f"Mascot: random_image_path: {random_image_path}") # Log which file was chosen
733
+
734
+ # Open and display the image using Gradio
735
+ try:
736
+ with Image.open(random_image_path) as img:
737
+ logging.debug(f"Mascot: Opened image: {random_image_path}") # Confirm image is opened
738
+ gr.Image(value=img, show_label=False, show_download_button=False, show_share_button=False, height=200)
739
+ except Exception as e:
740
+ logging.error(f"Mascot: Failed to open or display the image: {e}") # Log if image fails to open or display
741
+ else:
742
+ logging.warning("Mascot: No files found in 'mascotimages' directory") # Log if no files are found
743
+
744
+ except Exception as e:
745
+ logging.error(f"Mascot: Error listing files in directory: {e}") # Log if there's an error listing the directory
746
+
747
+
748
+
749
+
750
+
751
+
752
+
753
+
754
  submit_button = gr.Button(variant="primary")
755
  with gr.Row():
756
  with gr.Column(scale=3):
predict_all_tags_from_dump.ipynb ADDED
@@ -0,0 +1,721 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "id": "55c95870",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import csv\n",
11
+ "import gzip\n",
12
+ "from math import log\n",
13
+ "from collections import Counter\n",
14
+ "from sys import maxsize\n",
15
+ "import numpy as np\n",
16
+ "import joblib\n",
17
+ "from collections import OrderedDict\n",
18
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
19
+ "from collections import defaultdict\n",
20
+ "import sys\n",
21
+ "from scipy.sparse import dok_matrix\n",
22
+ "from sklearn.preprocessing import normalize\n",
23
+ "from sklearn.decomposition import TruncatedSVD\n",
24
+ "\n",
25
+ "\n",
26
+ "\n",
27
+ "posts_file = 'posts-2024-04-14.csv.gz'\n",
28
+ "fluffyrock_tags_list_file = 'fluffyrock_3m.csv'\n",
29
+ "\n",
30
+ "\n",
31
+ "def extract_artist_names(file_path):\n",
32
+ " \"\"\"\n",
33
+ " Extract artist names from a CSV file where each row contains tag information,\n",
34
+ " and the first column contains the tag's name. Artist tags start with 'by_'.\n",
35
+ "\n",
36
+ " :param file_path: Path to the CSV file\n",
37
+ " :return: A set containing artist names without the 'by_' prefix\n",
38
+ " \"\"\"\n",
39
+ " artists = set()\n",
40
+ "\n",
41
+ " # Open the CSV file and read it\n",
42
+ " with open(file_path, newline='', encoding='utf-8') as csvfile:\n",
43
+ " reader = csv.reader(csvfile)\n",
44
+ " \n",
45
+ " # Iterate over each row in the CSV file\n",
46
+ " for row in reader:\n",
47
+ " tag_name = row[0] # Assuming the first column contains the tag names\n",
48
+ " if tag_name.startswith('by_'):\n",
49
+ " # Strip 'by_' from the start of the tag name and add it to the set\n",
50
+ " artist_name = tag_name[3:] # Remove the first three characters 'by_'\n",
51
+ " artists.add(tag_name)\n",
52
+ "\n",
53
+ " return artists\n",
54
+ "\n",
55
+ "\n",
56
+ "def build_tag_list(tags, e621_rating_character, fav_count, artist_names):\n",
57
+ " results = []\n",
58
+ " \n",
59
+ " #score\n",
60
+ " score_value = min(1.0, (log(int(fav_count)+1) / 10))\n",
61
+ " rounded_score_value = round(score_value * 10)\n",
62
+ " results.append(f\"score: {rounded_score_value}\")\n",
63
+ " \n",
64
+ " #rating\n",
65
+ " results.append(\"rating:\" + e621_rating_character)\n",
66
+ " \n",
67
+ " #regular tags and artists\n",
68
+ " for tag in tags:\n",
69
+ " if tag in artist_names:\n",
70
+ " results.append(\"by_\" + tag)\n",
71
+ " else:\n",
72
+ " results.append(tag)\n",
73
+ " return results\n",
74
+ "\n",
75
+ "\n",
76
+ "def read_csv_as_dict(file_path):\n",
77
+ " \"\"\"\n",
78
+ " Generator function to read a gzipped CSV file and yield each row as a dictionary\n",
79
+ " where keys are the column names and values are the data in each column.\n",
80
+ "\n",
81
+ " :param file_path: Path to the .csv.gz file\n",
82
+ " \"\"\"\n",
83
+ " \n",
84
+ " #counter=0\n",
85
+ " with gzip.open(file_path, 'rt', newline='', encoding='utf-8') as gz_file:\n",
86
+ " csv.field_size_limit(1000000)\n",
87
+ " reader = csv.DictReader(gz_file)\n",
88
+ " for row in reader:\n",
89
+ " #counter += 1\n",
90
+ " #if counter % 100 == 0:\n",
91
+ " yield row\n",
92
+ " \n",
93
+ " \n",
94
+ "def process_tags_from_csv(file_path, artist_names):\n",
95
+ " \"\"\"\n",
96
+ " Generator function that reads rows from a CSV file, processes each row to extract and\n",
97
+ " build tag lists, and yields these lists one at a time.\n",
98
+ "\n",
99
+ " :param file_path: The path to the gzipped CSV file.\n",
100
+ " :param artist_names: A set containing all artist names for tag processing.\n",
101
+ " :return: Yields lists of tags for each row.\n",
102
+ " \"\"\"\n",
103
+ " for row in read_csv_as_dict(file_path):\n",
104
+ " base_tags = row['tag_string'].split(' ')\n",
105
+ " rating_character = row['rating']\n",
106
+ " fav_count = row['fav_count']\n",
107
+ " all_tags = build_tag_list(base_tags, rating_character, fav_count, artist_names)\n",
108
+ " yield all_tags\n",
109
+ " \n",
110
+ " \n",
111
+ "def construct_pseudo_vector(pseudo_doc_terms, idf_loaded, tag_to_column_loaded):\n",
112
+ "    # Initialize a vector of zeros with the length of the tag_to_column_loaded mapping\n",
113
+ " pseudo_vector = np.zeros(len(tag_to_column_loaded))\n",
114
+ " \n",
115
+ " # Fill in the vector for terms in the pseudo document\n",
116
+ " for term in pseudo_doc_terms:\n",
117
+ " if term in tag_to_column_loaded:\n",
118
+ " index = tag_to_column_loaded[term]\n",
119
+ " pseudo_vector[index] = idf_loaded.get(term, 0)\n",
120
+ " \n",
121
+ " # Return the vector as a 2D array for compatibility with SVD transform\n",
122
+ " return pseudo_vector.reshape(1, -1)"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "code",
127
+ "execution_count": null,
128
+ "id": "0a9becfd",
129
+ "metadata": {},
130
+ "outputs": [],
131
+ "source": [
132
+ "all_artist_names = extract_artist_names(fluffyrock_tags_list_file)\n",
133
+ "\n",
134
+ "tag_count = Counter()\n",
135
+ "min_occurrences = 200\n",
136
+ " \n",
137
+ "for all_tags in process_tags_from_csv(posts_file, all_artist_names):\n",
138
+ " tag_count.update(all_tags)\n",
139
+ " \n",
140
+ "\n",
141
+ "# Sort tags by frequency and keep only those that occur at least min_occurrences times\n",
142
+ "sorted_tags = tag_count.most_common()\n",
143
+ "filtered_tags = [tag for tag, count in sorted_tags if count >= min_occurrences]\n",
144
+ "\n",
145
+ "# Print tag counts before and after filtering\n",
146
+ "print(\"Tag count before filtering: \", len(tag_count))\n",
147
+ "print(\"Tag count after filtering: \", len(filtered_tags))"
148
+ ]
149
+ },
150
+ {
151
+ "cell_type": "code",
152
+ "execution_count": null,
153
+ "id": "56f8d7cd",
154
+ "metadata": {},
155
+ "outputs": [],
156
+ "source": [
157
+ "# Initialize a dictionary to hold the co-occurrences for each tag in filtered_tags\n",
158
+ "# Using a nested defaultdict for automatic handling of missing keys\n",
159
+ "pseudo_docs = defaultdict(lambda: defaultdict(int))\n",
160
+ "\n",
161
+ "# Number of rows processed\n",
162
+ "total_rows_processed = 0\n",
163
+ "\n",
164
+ "# Read each row and process the tags\n",
165
+ "for all_tags in process_tags_from_csv(posts_file, all_artist_names):\n",
166
+ " # Filter the tags in the current list to include only those in filtered_tags\n",
167
+ " filtered_tag_list = [tag for tag in all_tags if tag in filtered_tags]\n",
168
+ " \n",
169
+ " # For each tag in the filtered list\n",
170
+ " for tag in filtered_tag_list:\n",
171
+ " # For each co-occurring tag in the same list\n",
172
+ " for co_occur_tag in filtered_tag_list:\n",
173
+ " if co_occur_tag != tag:\n",
174
+ " pseudo_docs[tag][co_occur_tag] += 1\n",
175
+ "\n",
176
+ "    # Count total rows processed for progress monitoring\n",
177
+ " total_rows_processed += 1\n",
178
+ " if total_rows_processed % 10000 == 0:\n",
179
+ " print(f\"Processed {total_rows_processed} rows\", file=sys.stderr)\n",
180
+ "\n",
181
+ "print(\"Processing complete.\")\n"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "execution_count": null,
187
+ "id": "b1d011a5",
188
+ "metadata": {},
189
+ "outputs": [],
190
+ "source": [
191
+ "# Number of pseudo-documents\n",
192
+ "N = len(pseudo_docs)\n",
193
+ "\n",
194
+ "# Calculate TF and DF\n",
195
+ "tf = {}\n",
196
+ "df = {}\n",
197
+ "for doc, terms in pseudo_docs.items():\n",
198
+ " tf[doc] = {}\n",
199
+ " total_terms = sum(terms.values())\n",
200
+ " for term, count in terms.items():\n",
201
+ " tf[doc][term] = count / total_terms # Term Frequency\n",
202
+ " df[term] = df.get(term, 0) + 1 # Document Frequency\n",
203
+ " \n",
204
+ "# Ensure all terms are indexed\n",
205
+ "all_terms = set(df.keys())\n",
206
+ "term_to_column_index = {term: idx for idx, term in enumerate(all_terms)}\n",
207
+ "\n",
208
+ "# Calculate IDF\n",
209
+ "idf = {term: log((N + 1) / (df_val + 1)) for term, df_val in df.items()} # Adding 1 to prevent division by zero\n",
210
+ "\n",
211
+ "# Initialize the TF-IDF matrix\n",
212
+ "tfidf_matrix = dok_matrix((N, len(df)), dtype=float)\n",
213
+ "\n",
214
+ "# Mapping of tags to matrix rows\n",
215
+ "tag_to_row = {tag: idx for idx, tag in enumerate(pseudo_docs)}\n",
216
+ "\n",
217
+ "# Compute TF-IDF and fill the matrix\n",
218
+ "for doc, terms in tf.items():\n",
219
+ " row_idx = tag_to_row[doc]\n",
220
+ " for term, tf_val in terms.items():\n",
221
+ "        col_idx = term_to_column_index[term]  # Use term_to_column_index for column indexing\n",
222
+ " tfidf_matrix[row_idx, col_idx] = tf_val * idf[term]\n",
223
+ "\n",
224
+ "# Convert to CSR format for efficient row slicing\n",
225
+ "tfidf_matrix = tfidf_matrix.tocsr()\n",
226
+ "\n",
227
+ "print(\"TF-IDF matrix shape:\", tfidf_matrix.shape)\n"
228
+ ]
229
+ },
230
+ {
231
+ "cell_type": "code",
232
+ "execution_count": null,
233
+ "id": "b098a5fb",
234
+ "metadata": {},
235
+ "outputs": [],
236
+ "source": [
237
+ "# Choose the number of components for the reduced dimensionality\n",
238
+ "n_components = 300 # For example, reducing to 300 dimensions\n",
239
+ "\n",
240
+ "# Initialize the TruncatedSVD object\n",
241
+ "svd = TruncatedSVD(n_components=n_components, random_state=42)\n",
242
+ "\n",
243
+ "# Fit and transform the TF-IDF matrix\n",
244
+ "reduced_matrix = svd.fit_transform(tfidf_matrix)\n",
245
+ "\n",
246
+ "# 'reduced_matrix' now has a shape of (N, n_components), where N is the number of pseudo-documents, e.g., (N, 300)"
247
+ ]
248
+ },
249
+ {
250
+ "cell_type": "code",
251
+ "execution_count": null,
252
+ "id": "023ae26f",
253
+ "metadata": {},
254
+ "outputs": [],
255
+ "source": []
256
+ },
257
+ {
258
+ "cell_type": "code",
259
+ "execution_count": null,
260
+ "id": "06ec21c4",
261
+ "metadata": {},
262
+ "outputs": [],
263
+ "source": [
264
+ "# Step 1: Construct TF vector for the pseudo-document\n",
265
+ "pseudo_doc_terms = [\"female\"]\n",
266
+ "pseudo_tfidf_vector = construct_pseudo_vector(pseudo_doc_terms, idf, term_to_column_index)\n",
267
+ "\n",
268
+ "# Assuming 'tfidf_matrix' is your original TF-IDF matrix and 'reduced_matrix' is obtained from Truncated SVD\n",
269
+ "# 'pseudo_tfidf_vector' is the TF-IDF vector for your pseudo-document, constructed as previously discussed\n",
270
+ "\n",
271
+ "# For the original TF-IDF matrix\n",
272
+ "# Compute cosine similarities\n",
273
+ "cosine_similarities_full = cosine_similarity(pseudo_tfidf_vector, tfidf_matrix).flatten()\n",
274
+ "print(\"Cosine similarities (full matrix):\", cosine_similarities_full)\n",
275
+ "# Identify the indices of the top 10 most similar tags\n",
276
+ "top_indices_full = np.argsort(cosine_similarities_full)[-10:][::-1]\n",
277
+ "\n",
278
+ "# For the reduced matrix\n",
279
+ "# Reduce the dimensionality of the pseudo-document vector\n",
280
+ "# Before calculating similarities, print the TF-IDF vectors\n",
281
+ "print(\"Pseudo TF-IDF vector:\", pseudo_tfidf_vector)\n",
282
+ "reduced_pseudo_vector = svd.transform(pseudo_tfidf_vector)\n",
283
+ "print(\"Reduced pseudo-document vector:\", reduced_pseudo_vector)\n",
284
+ "\n",
285
+ "# Compute cosine similarities in the reduced space\n",
286
+ "cosine_similarities_reduced = cosine_similarity(reduced_pseudo_vector, reduced_matrix).flatten()\n",
287
+ "print(\"Cosine similarities (reduced matrix):\", cosine_similarities_reduced)\n",
288
+ "\n",
289
+ "\n",
290
+ "# Identify the indices of the top 10 most similar tags in the reduced space, sorted from most to least similar\n",
291
+ "top_indices_reduced = np.argsort(cosine_similarities_reduced)[-10:][::-1]\n",
292
+ "\n",
293
+ "\n",
294
+ "# Convert indices to tag names using the inverse of your 'tag_to_row' mapping\n",
295
+ "# Printing the tag to index and index to tag mappings\n",
296
+ "print(\"tag_to_row mapping (partial):\", dict(list(tag_to_row.items())[:12]))  # Print only the first 12 for brevity\n",
297
+ "row_to_tag = {idx: tag for tag, idx in tag_to_row.items()}\n",
298
+ "print(\"row_to_tag mapping (partial):\", dict(list(row_to_tag.items())[:12]))\n",
299
+ "\n",
300
+ "# Generate lists of tags with their corresponding similarity scores\n",
301
+ "top_tags_full = [(row_to_tag[idx], cosine_similarities_full[idx]) for idx in top_indices_full]\n",
302
+ "top_tags_reduced = [(row_to_tag[idx], cosine_similarities_reduced[idx]) for idx in top_indices_reduced]\n",
303
+ "\n",
304
+ "# Output the results with scores\n",
305
+ "print(\"Most similar tags (Full Matrix):\")\n",
306
+ "for tag, score in top_tags_full:\n",
307
+ " print(f\"{tag}: {score:.4f}\")\n",
308
+ "\n",
309
+ "print(\"Most similar tags (Reduced Matrix):\")\n",
310
+ "for tag, score in top_tags_reduced:\n",
311
+ " print(f\"{tag}: {score:.4f}\")\n"
312
+ ]
313
+ },
314
+ {
315
+ "cell_type": "code",
316
+ "execution_count": null,
317
+ "id": "91753fa3",
318
+ "metadata": {},
319
+ "outputs": [],
320
+ "source": [
321
+ "#Save the model to a file\n",
322
+ "\n",
323
+ "# Package necessary components\n",
324
+ "components_to_save = {\n",
325
+ " 'idf': idf,\n",
326
+ " 'tag_to_column_index': term_to_column_index,\n",
327
+ " 'row_to_tag': row_to_tag, \n",
328
+ " 'reduced_matrix': reduced_matrix,\n",
329
+ " 'svd_model': svd\n",
330
+ "}\n",
331
+ "\n",
332
+ "# Save the components into a file\n",
333
+ "joblib.dump(components_to_save, 'components_file418.joblib')"
334
+ ]
335
+ },
336
+ {
337
+ "cell_type": "code",
338
+ "execution_count": null,
339
+ "id": "2e08dc1a",
340
+ "metadata": {},
341
+ "outputs": [],
342
+ "source": []
343
+ },
344
+ {
345
+ "cell_type": "code",
346
+ "execution_count": 3,
347
+ "id": "d066db2f",
348
+ "metadata": {},
349
+ "outputs": [
350
+ {
351
+ "name": "stdout",
352
+ "output_type": "stream",
353
+ "text": [
354
+ "Most similar tags (Reduced Matrix):\n",
355
+ "nameless_(arbuzbudesh): 0.0000\n",
356
+ "knotted_dildo: 0.0000\n",
357
+ "black_legs: 0.0000\n",
358
+ "disguise: 0.0000\n",
359
+ "lineup: 0.0000\n",
360
+ "olympics: 0.0000\n",
361
+ "burping: 0.0000\n",
362
+ "pink_collar: 0.0000\n",
363
+ "team_rocket: 0.0000\n",
364
+ "studded_bracelet: 0.0000\n"
365
+ ]
366
+ }
367
+ ],
368
+ "source": [
369
+ "#Reload and test file\n",
370
+ "\n",
371
+ "# Load the saved components from the joblib file\n",
372
+ "components = joblib.load('tf_idf_files_418_updated.joblib')\n",
373
+ "\n",
374
+ "# Extract necessary components\n",
375
+ "idf = components['idf']\n",
376
+ "term_to_column_index = components['tag_to_column_index']\n",
377
+ "row_to_tag = components['row_to_tag']\n",
378
+ "reduced_matrix = components['reduced_matrix']\n",
379
+ "svd = components['svd_model']\n",
380
+ "\n",
381
+ "# Construct the TF-IDF vector for \"blue_(jurassic_world)\"\n",
382
+ "pseudo_tfidf_vector = construct_pseudo_vector(\"blue_(jurassic_world)\", idf, term_to_column_index)\n",
383
+ "\n",
384
+ "# Reduce the dimensionality of the pseudo-document vector for the reduced matrix\n",
385
+ "reduced_pseudo_vector = svd.transform(pseudo_tfidf_vector)\n",
386
+ "\n",
387
+ "# Compute cosine similarities in the reduced space\n",
388
+ "cosine_similarities_reduced = cosine_similarity(reduced_pseudo_vector, reduced_matrix).flatten()\n",
389
+ "\n",
390
+ "# Sort the indices by descending cosine similarity\n",
391
+ "top_indices_reduced = np.argsort(cosine_similarities_reduced)[::-1][:10]\n",
392
+ "\n",
393
+ "# Display the most similar tags in the reduced matrix with their scores\n",
394
+ "print(\"Most similar tags (Reduced Matrix):\")\n",
395
+ "for idx in top_indices_reduced:\n",
396
+ " tag = row_to_tag[idx]\n",
397
+ " score = cosine_similarities_reduced[idx]\n",
398
+ " print(f\"{tag}: {score:.4f}\")\n"
399
+ ]
400
+ },
401
+ {
402
+ "cell_type": "code",
403
+ "execution_count": null,
404
+ "id": "ddea5f32",
405
+ "metadata": {},
406
+ "outputs": [],
407
+ "source": []
408
+ },
409
+ {
410
+ "cell_type": "code",
411
+ "execution_count": null,
412
+ "id": "74897a5c",
413
+ "metadata": {},
414
+ "outputs": [],
415
+ "source": []
416
+ },
417
+ {
418
+ "cell_type": "code",
419
+ "execution_count": null,
420
+ "id": "c0c5b32d",
421
+ "metadata": {},
422
+ "outputs": [],
423
+ "source": []
424
+ },
425
+ {
426
+ "cell_type": "code",
427
+ "execution_count": null,
428
+ "id": "9ff9a331",
429
+ "metadata": {},
430
+ "outputs": [],
431
+ "source": []
432
+ },
433
+ {
434
+ "cell_type": "code",
435
+ "execution_count": null,
436
+ "id": "91c66b57",
437
+ "metadata": {},
438
+ "outputs": [],
439
+ "source": []
440
+ },
441
+ {
442
+ "cell_type": "code",
443
+ "execution_count": null,
444
+ "id": "a830c6cf",
445
+ "metadata": {},
446
+ "outputs": [],
447
+ "source": []
448
+ },
449
+ {
450
+ "cell_type": "code",
451
+ "execution_count": null,
452
+ "id": "4cdc98f0",
453
+ "metadata": {},
454
+ "outputs": [],
455
+ "source": []
456
+ },
457
+ {
458
+ "cell_type": "code",
459
+ "execution_count": null,
460
+ "id": "150d66f3",
461
+ "metadata": {},
462
+ "outputs": [],
463
+ "source": []
464
+ },
465
+ {
466
+ "cell_type": "code",
467
+ "execution_count": null,
468
+ "id": "337b1f65",
469
+ "metadata": {},
470
+ "outputs": [],
471
+ "source": []
472
+ },
473
+ {
474
+ "cell_type": "code",
475
+ "execution_count": null,
476
+ "id": "34d2fde1",
477
+ "metadata": {},
478
+ "outputs": [],
479
+ "source": []
480
+ },
481
+ {
482
+ "cell_type": "code",
483
+ "execution_count": null,
484
+ "id": "9fc197d8",
485
+ "metadata": {},
486
+ "outputs": [],
487
+ "source": []
488
+ },
489
+ {
490
+ "cell_type": "code",
491
+ "execution_count": null,
492
+ "id": "bfa9c299",
493
+ "metadata": {},
494
+ "outputs": [],
495
+ "source": []
496
+ },
497
+ {
498
+ "cell_type": "code",
499
+ "execution_count": null,
500
+ "id": "551a8453",
501
+ "metadata": {},
502
+ "outputs": [],
503
+ "source": []
504
+ },
505
+ {
506
+ "cell_type": "code",
507
+ "execution_count": null,
508
+ "id": "0dcdeb9e",
509
+ "metadata": {},
510
+ "outputs": [],
511
+ "source": []
512
+ },
513
+ {
514
+ "cell_type": "code",
515
+ "execution_count": null,
516
+ "id": "537c9e26",
517
+ "metadata": {},
518
+ "outputs": [],
519
+ "source": []
520
+ },
521
+ {
522
+ "cell_type": "code",
523
+ "execution_count": null,
524
+ "id": "aa873abf",
525
+ "metadata": {},
526
+ "outputs": [],
527
+ "source": []
528
+ },
529
+ {
530
+ "cell_type": "code",
531
+ "execution_count": null,
532
+ "id": "41aca76f",
533
+ "metadata": {},
534
+ "outputs": [],
535
+ "source": []
536
+ },
537
+ {
538
+ "cell_type": "code",
539
+ "execution_count": null,
540
+ "id": "36a3ae96",
541
+ "metadata": {},
542
+ "outputs": [],
543
+ "source": []
544
+ },
545
+ {
546
+ "cell_type": "code",
547
+ "execution_count": null,
548
+ "id": "fb59bac3",
549
+ "metadata": {},
550
+ "outputs": [],
551
+ "source": []
552
+ },
553
+ {
554
+ "cell_type": "code",
555
+ "execution_count": null,
556
+ "id": "39c87db9",
557
+ "metadata": {},
558
+ "outputs": [],
559
+ "source": []
560
+ },
561
+ {
562
+ "cell_type": "code",
563
+ "execution_count": null,
564
+ "id": "1646e731",
565
+ "metadata": {},
566
+ "outputs": [],
567
+ "source": []
568
+ },
569
+ {
570
+ "cell_type": "code",
571
+ "execution_count": null,
572
+ "id": "99f95d09",
573
+ "metadata": {},
574
+ "outputs": [],
575
+ "source": []
576
+ },
577
+ {
578
+ "cell_type": "code",
579
+ "execution_count": null,
580
+ "id": "9d6a67c2",
581
+ "metadata": {},
582
+ "outputs": [],
583
+ "source": []
584
+ },
585
+ {
586
+ "cell_type": "code",
587
+ "execution_count": null,
588
+ "id": "32acbfd7",
589
+ "metadata": {},
590
+ "outputs": [],
591
+ "source": []
592
+ },
593
+ {
594
+ "cell_type": "code",
595
+ "execution_count": null,
596
+ "id": "3c17cd42",
597
+ "metadata": {},
598
+ "outputs": [],
599
+ "source": []
600
+ },
601
+ {
602
+ "cell_type": "code",
603
+ "execution_count": null,
604
+ "id": "d333776c",
605
+ "metadata": {},
606
+ "outputs": [],
607
+ "source": []
608
+ },
609
+ {
610
+ "cell_type": "code",
611
+ "execution_count": null,
612
+ "id": "1e8c7511",
613
+ "metadata": {},
614
+ "outputs": [],
615
+ "source": []
616
+ },
617
+ {
618
+ "cell_type": "code",
619
+ "execution_count": null,
620
+ "id": "acf35591",
621
+ "metadata": {},
622
+ "outputs": [],
623
+ "source": []
624
+ },
625
+ {
626
+ "cell_type": "code",
627
+ "execution_count": null,
628
+ "id": "101fb083",
629
+ "metadata": {},
630
+ "outputs": [],
631
+ "source": []
632
+ },
633
+ {
634
+ "cell_type": "code",
635
+ "execution_count": null,
636
+ "id": "f8bd8551",
637
+ "metadata": {},
638
+ "outputs": [],
639
+ "source": []
640
+ },
641
+ {
642
+ "cell_type": "code",
643
+ "execution_count": null,
644
+ "id": "271b9c12",
645
+ "metadata": {},
646
+ "outputs": [],
647
+ "source": []
648
+ },
649
+ {
650
+ "cell_type": "code",
651
+ "execution_count": null,
652
+ "id": "a232e088",
653
+ "metadata": {},
654
+ "outputs": [],
655
+ "source": []
656
+ },
657
+ {
658
+ "cell_type": "code",
659
+ "execution_count": null,
660
+ "id": "43df0240",
661
+ "metadata": {},
662
+ "outputs": [],
663
+ "source": []
664
+ },
665
+ {
666
+ "cell_type": "code",
667
+ "execution_count": null,
668
+ "id": "8dbb05e8",
669
+ "metadata": {},
670
+ "outputs": [],
671
+ "source": [
672
+ "\n"
673
+ ]
674
+ },
675
+ {
676
+ "cell_type": "code",
677
+ "execution_count": null,
678
+ "id": "9730cb16",
679
+ "metadata": {},
680
+ "outputs": [],
681
+ "source": []
682
+ },
683
+ {
684
+ "cell_type": "code",
685
+ "execution_count": null,
686
+ "id": "d38f92b2",
687
+ "metadata": {},
688
+ "outputs": [],
689
+ "source": []
690
+ },
691
+ {
692
+ "cell_type": "code",
693
+ "execution_count": null,
694
+ "id": "879f5463",
695
+ "metadata": {},
696
+ "outputs": [],
697
+ "source": []
698
+ }
699
+ ],
700
+ "metadata": {
701
+ "kernelspec": {
702
+ "display_name": "Python 3 (ipykernel)",
703
+ "language": "python",
704
+ "name": "python3"
705
+ },
706
+ "language_info": {
707
+ "codemirror_mode": {
708
+ "name": "ipython",
709
+ "version": 3
710
+ },
711
+ "file_extension": ".py",
712
+ "mimetype": "text/x-python",
713
+ "name": "python",
714
+ "nbconvert_exporter": "python",
715
+ "pygments_lexer": "ipython3",
716
+ "version": "3.10.9"
717
+ }
718
+ },
719
+ "nbformat": 4,
720
+ "nbformat_minor": 5
721
+ }