{ "cells": [ { "cell_type": "code", "execution_count": 2, "id": "55c95870", "metadata": {}, "outputs": [], "source": [ "import csv\n", "import gzip\n", "from math import log\n", "from collections import Counter\n", "from sys import maxsize\n", "import numpy as np\n", "import joblib\n", "from collections import OrderedDict\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "from collections import defaultdict\n", "import sys\n", "from scipy.sparse import dok_matrix\n", "from sklearn.preprocessing import normalize\n", "from sklearn.decomposition import TruncatedSVD\n", "\n", "\n", "\n", "posts_file = 'posts-2024-04-14.csv.gz'\n", "fluffyrock_tags_list_file = 'fluffyrock_3m.csv'\n", "\n", "\n", "def extract_artist_names(file_path):\n", " \"\"\"\n", " Extract artist names from a CSV file where each row contains tag information,\n", " and the first column contains the tag's name. Artist tags start with 'by_'.\n", "\n", " :param file_path: Path to the CSV file\n", " :return: A set containing artist names without the 'by_' prefix\n", " \"\"\"\n", " artists = set()\n", "\n", " # Open the CSV file and read it\n", " with open(file_path, newline='', encoding='utf-8') as csvfile:\n", " reader = csv.reader(csvfile)\n", " \n", " # Iterate over each row in the CSV file\n", " for row in reader:\n", " tag_name = row[0] # Assuming the first column contains the tag names\n", " if tag_name.startswith('by_'):\n", " # Strip 'by_' from the start of the tag name and add it to the set\n", " artist_name = tag_name[3:] # Remove the first three characters 'by_'\n", " artists.add(tag_name)\n", "\n", " return artists\n", "\n", "\n", "def build_tag_list(tags, e621_rating_character, fav_count, artist_names):\n", " results = []\n", " \n", " #score\n", " score_value = min(1.0, (log(int(fav_count)+1) / 10))\n", " rounded_score_value = round(score_value * 10)\n", " results.append(f\"score: {rounded_score_value}\")\n", " \n", " #rating\n", " results.append(\"rating:\" + e621_rating_character)\n", " \n", " #regular tags and artists\n", " for tag in tags:\n", " if tag in artist_names:\n", " results.append(\"by_\" + tag)\n", " else:\n", " results.append(tag)\n", " return results\n", "\n", "\n", "def read_csv_as_dict(file_path):\n", " \"\"\"\n", " Generator function to read a gzipped CSV file and yield each row as a dictionary\n", " where keys are the column names and values are the data in each column.\n", "\n", " :param file_path: Path to the .csv.gz file\n", " \"\"\"\n", " \n", " #counter=0\n", " with gzip.open(file_path, 'rt', newline='', encoding='utf-8') as gz_file:\n", " csv.field_size_limit(1000000)\n", " reader = csv.DictReader(gz_file)\n", " for row in reader:\n", " #counter += 1\n", " #if counter % 100 == 0:\n", " yield row\n", " \n", " \n", "def process_tags_from_csv(file_path, artist_names):\n", " \"\"\"\n", " Generator function that reads rows from a CSV file, processes each row to extract and\n", " build tag lists, and yields these lists one at a time.\n", "\n", " :param file_path: The path to the gzipped CSV file.\n", " :param artist_names: A set containing all artist names for tag processing.\n", " :return: Yields lists of tags for each row.\n", " \"\"\"\n", " for row in read_csv_as_dict(file_path):\n", " base_tags = row['tag_string'].split(' ')\n", " rating_character = row['rating']\n", " fav_count = row['fav_count']\n", " all_tags = build_tag_list(base_tags, rating_character, fav_count, artist_names)\n", " yield all_tags\n", " \n", " \n", "def construct_pseudo_vector(pseudo_doc_terms, idf_loaded, tag_to_column_loaded):\n", " # Initialize a vector of zeros with the length of the term_to_index mapping\n", " pseudo_vector = np.zeros(len(tag_to_column_loaded))\n", " \n", " # Fill in the vector for terms in the pseudo document\n", " for term in pseudo_doc_terms:\n", " if term in tag_to_column_loaded:\n", " index = tag_to_column_loaded[term]\n", " pseudo_vector[index] = idf_loaded.get(term, 0)\n", " \n", " # Return the vector as a 2D array for compatibility with SVD transform\n", " return pseudo_vector.reshape(1, -1)" ] }, { "cell_type": "code", "execution_count": null, "id": "0a9becfd", "metadata": {}, "outputs": [], "source": [ "all_artist_names = extract_artist_names(fluffyrock_tags_list_file)\n", "\n", "tag_count = Counter()\n", "min_occurrences = 200\n", " \n", "for all_tags in process_tags_from_csv(posts_file, all_artist_names):\n", " tag_count.update(all_tags)\n", " \n", "\n", "# Apply the counting logic from the first code snippet\n", "sorted_tags = tag_count.most_common()\n", "filtered_tags = [tag for tag, count in sorted_tags if count >= min_occurrences]\n", "\n", "# Print tag counts before and after filtering\n", "print(\"Tag count before filtering: \", len(tag_count))\n", "print(\"Tag count after filtering: \", len(filtered_tags))" ] }, { "cell_type": "code", "execution_count": null, "id": "56f8d7cd", "metadata": {}, "outputs": [], "source": [ "# Initialize a dictionary to hold the co-occurrences for each tag in filtered_tags\n", "# Using a nested defaultdict for automatic handling of missing keys\n", "pseudo_docs = defaultdict(lambda: defaultdict(int))\n", "\n", "# Number of tags processed\n", "total_rows_processed = 0\n", "\n", "# Read each row and process the tags\n", "for all_tags in process_tags_from_csv(posts_file, all_artist_names):\n", " # Filter the tags in the current list to include only those in filtered_tags\n", " filtered_tag_list = [tag for tag in all_tags if tag in filtered_tags]\n", " \n", " # For each tag in the filtered list\n", " for tag in filtered_tag_list:\n", " # For each co-occurring tag in the same list\n", " for co_occur_tag in filtered_tag_list:\n", " if co_occur_tag != tag:\n", " pseudo_docs[tag][co_occur_tag] += 1\n", "\n", " # Counting total tags processed for progress monitoring\n", " total_rows_processed += 1\n", " if total_rows_processed % 10000 == 0:\n", " print(f\"Processed {total_rows_processed} rows\", file=sys.stderr)\n", "\n", "print(\"Processing complete.\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "b1d011a5", "metadata": {}, "outputs": [], "source": [ "# Number of pseudo-documents\n", "N = len(pseudo_docs)\n", "\n", "# Calculate TF and DF\n", "tf = {}\n", "df = {}\n", "for doc, terms in pseudo_docs.items():\n", " tf[doc] = {}\n", " total_terms = sum(terms.values())\n", " for term, count in terms.items():\n", " tf[doc][term] = count / total_terms # Term Frequency\n", " df[term] = df.get(term, 0) + 1 # Document Frequency\n", " \n", "# Ensure all terms are indexed\n", "all_terms = set(df.keys())\n", "term_to_column_index = {term: idx for idx, term in enumerate(all_terms)}\n", "\n", "# Calculate IDF\n", "idf = {term: log((N + 1) / (df_val + 1)) for term, df_val in df.items()} # Adding 1 to prevent division by zero\n", "\n", "# Initialize the TF-IDF matrix\n", "tfidf_matrix = dok_matrix((N, len(df)), dtype=float)\n", "\n", "# Mapping of tags to matrix rows\n", "tag_to_row = {tag: idx for idx, tag in enumerate(pseudo_docs)}\n", "\n", "# Compute TF-IDF and fill the matrix\n", "for doc, terms in tf.items():\n", " row_idx = tag_to_row[doc]\n", " for term, tf_val in terms.items():\n", " col_idx = term_to_column_index[term] # Use term_to_index for column indexing\n", " tfidf_matrix[row_idx, col_idx] = tf_val * idf[term]\n", "\n", "# Convert to CSR format for efficient row slicing\n", "tfidf_matrix = tfidf_matrix.tocsr()\n", "\n", "print(\"TF-IDF matrix shape:\", tfidf_matrix.shape)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "b098a5fb", "metadata": {}, "outputs": [], "source": [ "# Choose the number of components for the reduced dimensionality\n", "n_components = 300 # For example, reducing to 300 dimensions\n", "\n", "# Initialize the TruncatedSVD object\n", "svd = TruncatedSVD(n_components=n_components, random_state=42)\n", "\n", "# Fit and transform the TF-IDF matrix\n", "reduced_matrix = svd.fit_transform(tfidf_matrix)\n", "\n", "# 'reduced_matrix' now has a shape of (8500, n_components), e.g., (8500, 300)" ] }, { "cell_type": "code", "execution_count": null, "id": "023ae26f", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "06ec21c4", "metadata": {}, "outputs": [], "source": [ "# Step 1: Construct TF vector for the pseudo-document\n", "pseudo_doc_terms = [\"female\"]\n", "pseudo_tfidf_vector = construct_pseudo_vector(pseudo_doc_terms, idf, term_to_column_index)\n", "\n", "# Assuming 'tfidf_matrix' is your original TF-IDF matrix and 'reduced_matrix' is obtained from Truncated SVD\n", "# 'pseudo_tfidf_vector' is the TF-IDF vector for your pseudo-document, constructed as previously discussed\n", "\n", "# For the original TF-IDF matrix\n", "# Compute cosine similarities\n", "cosine_similarities_full = cosine_similarity(pseudo_tfidf_vector, tfidf_matrix).flatten()\n", "print(\"Cosine similarities (full matrix):\", cosine_similarities_full)\n", "# Identify the indices of the top 10 most similar tags\n", "top_indices_full = np.argsort(cosine_similarities_full)[-10:][::-1]\n", "\n", "# For the reduced matrix\n", "# Reduce the dimensionality of the pseudo-document vector\n", "# Before calculating similarities, print the TF-IDF vectors\n", "print(\"Pseudo TF-IDF vector:\", pseudo_tfidf_vector)\n", "reduced_pseudo_vector = svd.transform(pseudo_tfidf_vector)\n", "print(\"Reduced pseudo-document vector:\", reduced_pseudo_vector)\n", "\n", "# Compute cosine similarities in the reduced space\n", "cosine_similarities_reduced = cosine_similarity(reduced_pseudo_vector, reduced_matrix).flatten()\n", "print(\"Cosine similarities (reduced matrix):\", cosine_similarities_reduced)\n", "\n", "\n", "# Identify the indices of the top 10 most similar tags in the reduced space, sorted from most to least similar\n", "top_indices_reduced = np.argsort(cosine_similarities_reduced)[-10:][::-1]\n", "\n", "\n", "# Convert indices to tag names using the inverse of your 'tag_to_row' mapping\n", "# Printing the tag to index and index to tag mappings\n", "print(\"tag_to_row mapping (partial):\", dict(list(tag_to_row.items())[:12])) # Print only first 10 for brevity\n", "row_to_tag = {idx: tag for tag, idx in tag_to_row.items()}\n", "print(\"row_to_tag mapping (partial):\", dict(list(row_to_tag.items())[:12]))\n", "\n", "# Generate lists of tags with their corresponding similarity scores\n", "top_tags_full = [(row_to_tag[idx], cosine_similarities_full[idx]) for idx in top_indices_full]\n", "top_tags_reduced = [(row_to_tag[idx], cosine_similarities_reduced[idx]) for idx in top_indices_reduced]\n", "\n", "# Output the results with scores\n", "print(\"Most similar tags (Full Matrix):\")\n", "for tag, score in top_tags_full:\n", " print(f\"{tag}: {score:.4f}\")\n", "\n", "print(\"Most similar tags (Reduced Matrix):\")\n", "for tag, score in top_tags_reduced:\n", " print(f\"{tag}: {score:.4f}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "91753fa3", "metadata": {}, "outputs": [], "source": [ "#Save the model to a file\n", "\n", "# Package necessary components\n", "components_to_save = {\n", " 'idf': idf,\n", " 'tag_to_column_index': term_to_column_index,\n", " 'row_to_tag': row_to_tag, \n", " 'reduced_matrix': reduced_matrix,\n", " 'svd_model': svd\n", "}\n", "\n", "# Save the components into a file\n", "joblib.dump(components_to_save, 'components_file418.joblib')" ] }, { "cell_type": "code", "execution_count": null, "id": "2e08dc1a", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 3, "id": "d066db2f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Most similar tags (Reduced Matrix):\n", "nameless_(arbuzbudesh): 0.0000\n", "knotted_dildo: 0.0000\n", "black_legs: 0.0000\n", "disguise: 0.0000\n", "lineup: 0.0000\n", "olympics: 0.0000\n", "burping: 0.0000\n", "pink_collar: 0.0000\n", "team_rocket: 0.0000\n", "studded_bracelet: 0.0000\n" ] } ], "source": [ "#Reload and test file\n", "\n", "# Load the saved components from the joblib file\n", "components = joblib.load('tf_idf_files_418_updated.joblib')\n", "\n", "# Extract necessary components\n", "idf = components['idf']\n", "term_to_column_index = components['tag_to_column_index']\n", "row_to_tag = components['row_to_tag']\n", "reduced_matrix = components['reduced_matrix']\n", "svd = components['svd_model']\n", "\n", "# Construct the TF-IDF vector for \"domestic_dog\"\n", "pseudo_tfidf_vector = construct_pseudo_vector(\"blue_(jurassic_world)\", idf, term_to_column_index)\n", "\n", "# Reduce the dimensionality of the pseudo-document vector for the reduced matrix\n", "reduced_pseudo_vector = svd.transform(pseudo_tfidf_vector)\n", "\n", "# Compute cosine similarities in the reduced space\n", "cosine_similarities_reduced = cosine_similarity(reduced_pseudo_vector, reduced_matrix).flatten()\n", "\n", "# Sort the indices by descending cosine similarity\n", "top_indices_reduced = np.argsort(cosine_similarities_reduced)[::-1][:10]\n", "\n", "# Display the most similar tags in the reduced matrix with their scores\n", "print(\"Most similar tags (Reduced Matrix):\")\n", "for idx in top_indices_reduced:\n", " tag = row_to_tag[idx]\n", " score = cosine_similarities_reduced[idx]\n", " print(f\"{tag}: {score:.4f}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "ddea5f32", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "74897a5c", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "c0c5b32d", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "9ff9a331", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "91c66b57", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "a830c6cf", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "4cdc98f0", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "150d66f3", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "337b1f65", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "34d2fde1", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "9fc197d8", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "bfa9c299", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "551a8453", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "0dcdeb9e", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "537c9e26", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "aa873abf", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "41aca76f", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "36a3ae96", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "fb59bac3", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "39c87db9", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "1646e731", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "99f95d09", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "9d6a67c2", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "32acbfd7", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "3c17cd42", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "d333776c", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "1e8c7511", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "acf35591", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "101fb083", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "f8bd8551", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "271b9c12", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "a232e088", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "43df0240", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "8dbb05e8", "metadata": {}, "outputs": [], "source": [ "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "9730cb16", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "d38f92b2", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "879f5463", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" } }, "nbformat": 4, "nbformat_minor": 5 }