Spaces:

FoodDesert
/

Prompt_Squirrel

Running

File size: 20,842 Bytes

8b24305

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "55c95870",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import gzip\n",
    "from math import log\n",
    "from collections import Counter\n",
    "from sys import maxsize\n",
    "import numpy as np\n",
    "import joblib\n",
    "from collections import OrderedDict\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "from collections import defaultdict\n",
    "import sys\n",
    "from scipy.sparse import dok_matrix\n",
    "from sklearn.preprocessing import normalize\n",
    "from sklearn.decomposition import TruncatedSVD\n",
    "\n",
    "\n",
    "\n",
    "# Gzipped CSV of posts; process_tags_from_csv reads its 'tag_string', 'rating' and 'fav_count' columns.\n",
    "posts_file = 'posts-2024-04-14.csv.gz'\n",
    "# CSV tag list whose first column is the tag name; extract_artist_names scans it for 'by_'-prefixed tags.\n",
    "fluffyrock_tags_list_file = 'fluffyrock_3m.csv'\n",
    "\n",
    "\n",
    "def extract_artist_names(file_path):\n",
    "    \"\"\"\n",
    "    Extract artist names from a CSV file where each row contains tag information,\n",
    "    and the first column contains the tag's name. Artist tags start with 'by_'.\n",
    "\n",
    "    Blank rows and a bare 'by_' tag (which would give an empty name) are skipped.\n",
    "\n",
    "    :param file_path: Path to the CSV file\n",
    "    :return: A set containing artist names without the 'by_' prefix\n",
    "    \"\"\"\n",
    "    prefix = 'by_'\n",
    "    artists = set()\n",
    "\n",
    "    with open(file_path, newline='', encoding='utf-8') as csvfile:\n",
    "        for row in csv.reader(csvfile):\n",
    "            # csv.reader yields an empty list for a blank line; indexing it would raise IndexError\n",
    "            if not row:\n",
    "                continue\n",
    "            tag_name = row[0]  # The first column holds the tag name\n",
    "            if tag_name.startswith(prefix):\n",
    "                # Store the bare name: build_tag_list tests raw post tags against\n",
    "                # this set and prepends 'by_' itself when it finds a match.\n",
    "                artist_name = tag_name[len(prefix):]\n",
    "                if artist_name:\n",
    "                    artists.add(artist_name)\n",
    "\n",
    "    return artists\n",
    "\n",
    "\n",
    "def build_tag_list(tags, e621_rating_character, fav_count, artist_names):\n",
    "    \"\"\"\n",
    "    Assemble the full tag list for one post: a score bucket, a rating tag, then the post's tags.\n",
    "\n",
    "    :param tags: Iterable of the post's tag strings\n",
    "    :param e621_rating_character: Rating value, appended to 'rating:'\n",
    "    :param fav_count: Favourite count; anything int() accepts\n",
    "    :param artist_names: Container of names; tags found in it are emitted with a 'by_' prefix\n",
    "    :return: List starting with 'score: N' (N is at most 10) and 'rating:X', followed by one entry per tag\n",
    "    \"\"\"\n",
    "    # Log-scaled favourite count, capped at 1.0, then bucketed to an integer\n",
    "    log_favs = log(int(fav_count) + 1) / 10\n",
    "    score_bucket = round(min(1.0, log_favs) * 10)\n",
    "\n",
    "    prefix_tags = [f\"score: {score_bucket}\", \"rating:\" + e621_rating_character]\n",
    "    post_tags = [\"by_\" + tag if tag in artist_names else tag for tag in tags]\n",
    "    return prefix_tags + post_tags\n",
    "\n",
    "\n",
    "def read_csv_as_dict(file_path):\n",
    "    \"\"\"\n",
    "    Lazily read a gzipped CSV file, yielding one dict per data row keyed by the\n",
    "    header's column names.\n",
    "\n",
    "    :param file_path: Path to the .csv.gz file\n",
    "    \"\"\"\n",
    "    with gzip.open(file_path, mode='rt', newline='', encoding='utf-8') as handle:\n",
    "        # Set the csv module's field size limit (a module-wide setting) before parsing\n",
    "        csv.field_size_limit(1000000)\n",
    "        yield from csv.DictReader(handle)\n",
    "            \n",
    "            \n",
    "def process_tags_from_csv(file_path, artist_names):\n",
    "    \"\"\"\n",
    "    Stream tag lists out of a gzipped posts CSV, one list per row.\n",
    "\n",
    "    Each row's 'tag_string' is split on single spaces and combined with its\n",
    "    'rating' and 'fav_count' by build_tag_list.\n",
    "\n",
    "    :param file_path: The path to the gzipped CSV file.\n",
    "    :param artist_names: A set containing all artist names for tag processing.\n",
    "    :return: Yields lists of tags for each row.\n",
    "    \"\"\"\n",
    "    for post in read_csv_as_dict(file_path):\n",
    "        yield build_tag_list(\n",
    "            post['tag_string'].split(' '),\n",
    "            post['rating'],\n",
    "            post['fav_count'],\n",
    "            artist_names,\n",
    "        )\n",
    "        \n",
    "        \n",
    "def construct_pseudo_vector(pseudo_doc_terms, idf_loaded, tag_to_column_loaded):\n",
    "    \"\"\"\n",
    "    Build a query vector holding the IDF weight of each given term.\n",
    "\n",
    "    :param pseudo_doc_terms: Iterable of tag strings; a single bare string is treated as one tag\n",
    "    :param idf_loaded: Mapping of tag -> IDF weight (tags missing from it get weight 0)\n",
    "    :param tag_to_column_loaded: Mapping of tag -> column index; terms absent from it are ignored\n",
    "    :return: numpy array of shape (1, len(tag_to_column_loaded))\n",
    "    \"\"\"\n",
    "    # Iterating a bare string would walk its characters, match at most\n",
    "    # single-character tags and silently produce a (near) all-zero vector,\n",
    "    # so treat a string as a one-term document instead.\n",
    "    if isinstance(pseudo_doc_terms, str):\n",
    "        pseudo_doc_terms = [pseudo_doc_terms]\n",
    "\n",
    "    # One slot per column of the tag_to_column_loaded mapping\n",
    "    pseudo_vector = np.zeros(len(tag_to_column_loaded))\n",
    "\n",
    "    # Fill in the vector for terms in the pseudo document\n",
    "    for term in pseudo_doc_terms:\n",
    "        if term in tag_to_column_loaded:\n",
    "            index = tag_to_column_loaded[term]\n",
    "            pseudo_vector[index] = idf_loaded.get(term, 0)\n",
    "\n",
    "    # Return the vector as a 2D array for compatibility with SVD transform\n",
    "    return pseudo_vector.reshape(1, -1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a9becfd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Artist names are passed through to process_tags_from_csv for tag building\n",
    "all_artist_names = extract_artist_names(fluffyrock_tags_list_file)\n",
    "\n",
    "# Tags counted fewer than this many times across all posts are dropped\n",
    "min_occurrences = 200\n",
    "\n",
    "# First pass over the posts: tally every tag\n",
    "tag_count = Counter()\n",
    "for all_tags in process_tags_from_csv(posts_file, all_artist_names):\n",
    "    tag_count.update(all_tags)\n",
    "\n",
    "# Order tags from most to least frequent, then keep those meeting the threshold\n",
    "sorted_tags = tag_count.most_common()\n",
    "filtered_tags = [\n",
    "    name for name, occurrences in sorted_tags if occurrences >= min_occurrences\n",
    "]\n",
    "\n",
    "# Report vocabulary size before and after filtering\n",
    "print(\"Tag count before filtering: \", len(tag_count))\n",
    "print(\"Tag count after filtering: \", len(filtered_tags))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56f8d7cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Co-occurrence counts: pseudo_docs[tag][other_tag] counts how often other_tag\n",
    "# appeared alongside tag in a post. Nested defaultdicts create missing entries on first access.\n",
    "pseudo_docs = defaultdict(lambda: defaultdict(int))\n",
    "\n",
    "# Set for constant-time membership tests; testing against the filtered_tags\n",
    "# list would rescan it for every tag of every post.\n",
    "filtered_tag_set = set(filtered_tags)\n",
    "\n",
    "# Number of rows (posts) processed, for progress reporting\n",
    "total_rows_processed = 0\n",
    "\n",
    "# Second pass over the posts\n",
    "for all_tags in process_tags_from_csv(posts_file, all_artist_names):\n",
    "    # Keep only the tags that survived the frequency filter\n",
    "    filtered_tag_list = [tag for tag in all_tags if tag in filtered_tag_set]\n",
    "\n",
    "    # Count every ordered pair of distinct tags in this post\n",
    "    for tag in filtered_tag_list:\n",
    "        for co_occur_tag in filtered_tag_list:\n",
    "            if co_occur_tag != tag:\n",
    "                pseudo_docs[tag][co_occur_tag] += 1\n",
    "\n",
    "    # Progress goes to stderr every 10000 rows\n",
    "    total_rows_processed += 1\n",
    "    if total_rows_processed % 10000 == 0:\n",
    "        print(f\"Processed {total_rows_processed} rows\", file=sys.stderr)\n",
    "\n",
    "print(\"Processing complete.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b1d011a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of pseudo-documents (one per tag that co-occurred with at least one other tag)\n",
    "N = len(pseudo_docs)\n",
    "\n",
    "# Calculate TF and DF\n",
    "tf = {}\n",
    "df = {}\n",
    "for doc, terms in pseudo_docs.items():\n",
    "    tf[doc] = {}\n",
    "    total_terms = sum(terms.values())\n",
    "    for term, count in terms.items():\n",
    "        # Term frequency: this term's share of the pseudo-document's co-occurrence counts\n",
    "        tf[doc][term] = count / total_terms\n",
    "        # Document frequency: number of pseudo-documents containing the term\n",
    "        df[term] = df.get(term, 0) + 1\n",
    "\n",
    "# Index every term as a matrix column. Sorting fixes the column order;\n",
    "# enumerating a set of strings directly can give a different order on each kernel run.\n",
    "all_terms = set(df.keys())\n",
    "term_to_column_index = {term: idx for idx, term in enumerate(sorted(all_terms))}\n",
    "\n",
    "# Calculate IDF with add-one smoothing on both numerator and denominator\n",
    "idf = {term: log((N + 1) / (df_val + 1)) for term, df_val in df.items()}\n",
    "\n",
    "# Initialize the TF-IDF matrix: one row per pseudo-document, one column per term\n",
    "tfidf_matrix = dok_matrix((N, len(df)), dtype=float)\n",
    "\n",
    "# Mapping of tags to matrix rows\n",
    "tag_to_row = {tag: idx for idx, tag in enumerate(pseudo_docs)}\n",
    "\n",
    "# Compute TF-IDF and fill the matrix\n",
    "for doc, terms in tf.items():\n",
    "    row_idx = tag_to_row[doc]\n",
    "    for term, tf_val in terms.items():\n",
    "        col_idx = term_to_column_index[term]\n",
    "        tfidf_matrix[row_idx, col_idx] = tf_val * idf[term]\n",
    "\n",
    "# Convert to CSR format for efficient row slicing\n",
    "tfidf_matrix = tfidf_matrix.tocsr()\n",
    "\n",
    "print(\"TF-IDF matrix shape:\", tfidf_matrix.shape)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b098a5fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Choose the number of components for the reduced dimensionality\n",
    "n_components = 300  # For example, reducing to 300 dimensions\n",
    "\n",
    "# Initialize the TruncatedSVD object (fixed random_state so repeated fits use the same seed)\n",
    "svd = TruncatedSVD(n_components=n_components, random_state=42)\n",
    "\n",
    "# Fit and transform the TF-IDF matrix\n",
    "reduced_matrix = svd.fit_transform(tfidf_matrix)\n",
    "\n",
    "# 'reduced_matrix' now has one row per row of tfidf_matrix and n_components columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "023ae26f",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "06ec21c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 1: Build the query vector: the IDF weight of each query term, zero in every other column\n",
    "pseudo_doc_terms = [\"female\"]\n",
    "pseudo_tfidf_vector = construct_pseudo_vector(pseudo_doc_terms, idf, term_to_column_index)\n",
    "\n",
    "# 'tfidf_matrix' is the full TF-IDF matrix and 'reduced_matrix' its TruncatedSVD projection,\n",
    "# both built in earlier cells; 'tag_to_row' maps each tag to its row in them.\n",
    "\n",
    "# For the original TF-IDF matrix\n",
    "# Compute cosine similarities between the query vector and every row\n",
    "cosine_similarities_full = cosine_similarity(pseudo_tfidf_vector, tfidf_matrix).flatten()\n",
    "print(\"Cosine similarities (full matrix):\", cosine_similarities_full)\n",
    "# Row indices of the 10 highest similarities, most similar first\n",
    "top_indices_full = np.argsort(cosine_similarities_full)[-10:][::-1]\n",
    "\n",
    "# For the reduced matrix\n",
    "# Print the query vector, then project it with the already-fitted SVD\n",
    "# so it lives in the same space as 'reduced_matrix'\n",
    "print(\"Pseudo TF-IDF vector:\", pseudo_tfidf_vector)\n",
    "reduced_pseudo_vector = svd.transform(pseudo_tfidf_vector)\n",
    "print(\"Reduced pseudo-document vector:\", reduced_pseudo_vector)\n",
    "\n",
    "# Compute cosine similarities in the reduced space\n",
    "cosine_similarities_reduced = cosine_similarity(reduced_pseudo_vector, reduced_matrix).flatten()\n",
    "print(\"Cosine similarities (reduced matrix):\", cosine_similarities_reduced)\n",
    "\n",
    "\n",
    "# Row indices of the 10 highest similarities in the reduced space, most similar first\n",
    "top_indices_reduced = np.argsort(cosine_similarities_reduced)[-10:][::-1]\n",
    "\n",
    "\n",
    "# Convert indices to tag names using the inverse of the 'tag_to_row' mapping\n",
    "# Print a sample of the tag-to-row and row-to-tag mappings\n",
    "print(\"tag_to_row mapping (partial):\", dict(list(tag_to_row.items())[:12]))  # Print only the first 12 for brevity\n",
    "row_to_tag = {idx: tag for tag, idx in tag_to_row.items()}\n",
    "print(\"row_to_tag mapping (partial):\", dict(list(row_to_tag.items())[:12]))\n",
    "\n",
    "# Generate lists of (tag, similarity score) pairs\n",
    "top_tags_full = [(row_to_tag[idx], cosine_similarities_full[idx]) for idx in top_indices_full]\n",
    "top_tags_reduced = [(row_to_tag[idx], cosine_similarities_reduced[idx]) for idx in top_indices_reduced]\n",
    "\n",
    "# Output the results with scores\n",
    "print(\"Most similar tags (Full Matrix):\")\n",
    "for tag, score in top_tags_full:\n",
    "    print(f\"{tag}: {score:.4f}\")\n",
    "\n",
    "print(\"Most similar tags (Reduced Matrix):\")\n",
    "for tag, score in top_tags_reduced:\n",
    "    print(f\"{tag}: {score:.4f}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91753fa3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the model to a file\n",
    "\n",
    "# Package necessary components\n",
    "# NOTE(review): 'row_to_tag' is defined in the similarity-test cell above, so that cell must run before this one.\n",
    "components_to_save = {\n",
    "    'idf': idf,\n",
    "    'tag_to_column_index': term_to_column_index,\n",
    "    'row_to_tag': row_to_tag, \n",
    "    'reduced_matrix': reduced_matrix,\n",
    "    'svd_model': svd\n",
    "}\n",
    "\n",
    "# Save the components into a file\n",
    "# NOTE(review): the reload cell below reads 'tf_idf_files_418_updated.joblib', not this filename - confirm which is intended.\n",
    "joblib.dump(components_to_save, 'components_file418.joblib')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2e08dc1a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d066db2f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Most similar tags (Reduced Matrix):\n",
      "nameless_(arbuzbudesh): 0.0000\n",
      "knotted_dildo: 0.0000\n",
      "black_legs: 0.0000\n",
      "disguise: 0.0000\n",
      "lineup: 0.0000\n",
      "olympics: 0.0000\n",
      "burping: 0.0000\n",
      "pink_collar: 0.0000\n",
      "team_rocket: 0.0000\n",
      "studded_bracelet: 0.0000\n"
     ]
    }
   ],
   "source": [
    "# Reload and test file\n",
    "\n",
    "# Load the saved components from the joblib file.\n",
    "# NOTE(review): joblib.load unpickles arbitrary objects; only load files from a trusted source.\n",
    "components = joblib.load('tf_idf_files_418_updated.joblib')\n",
    "\n",
    "# Extract necessary components\n",
    "idf = components['idf']\n",
    "term_to_column_index = components['tag_to_column_index']\n",
    "row_to_tag = components['row_to_tag']\n",
    "reduced_matrix = components['reduced_matrix']\n",
    "svd = components['svd_model']\n",
    "\n",
    "# Construct the query vector for a single tag. The terms must be passed as a list:\n",
    "# a bare string is iterated character by character, which matches no real tags and\n",
    "# makes every cosine similarity come out as 0.0000.\n",
    "pseudo_tfidf_vector = construct_pseudo_vector([\"blue_(jurassic_world)\"], idf, term_to_column_index)\n",
    "\n",
    "# Reduce the dimensionality of the pseudo-document vector for the reduced matrix\n",
    "reduced_pseudo_vector = svd.transform(pseudo_tfidf_vector)\n",
    "\n",
    "# Compute cosine similarities in the reduced space\n",
    "cosine_similarities_reduced = cosine_similarity(reduced_pseudo_vector, reduced_matrix).flatten()\n",
    "\n",
    "# Sort the indices by descending cosine similarity and keep the top 10\n",
    "top_indices_reduced = np.argsort(cosine_similarities_reduced)[::-1][:10]\n",
    "\n",
    "# Display the most similar tags in the reduced matrix with their scores\n",
    "print(\"Most similar tags (Reduced Matrix):\")\n",
    "for idx in top_indices_reduced:\n",
    "    tag = row_to_tag[idx]\n",
    "    score = cosine_similarities_reduced[idx]\n",
    "    print(f\"{tag}: {score:.4f}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ddea5f32",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "74897a5c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c0c5b32d",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9ff9a331",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91c66b57",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a830c6cf",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4cdc98f0",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "150d66f3",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "337b1f65",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "34d2fde1",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9fc197d8",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bfa9c299",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "551a8453",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0dcdeb9e",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "537c9e26",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aa873abf",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41aca76f",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36a3ae96",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fb59bac3",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39c87db9",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1646e731",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "99f95d09",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d6a67c2",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "32acbfd7",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c17cd42",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d333776c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e8c7511",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "acf35591",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "101fb083",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f8bd8551",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "271b9c12",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a232e088",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "43df0240",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8dbb05e8",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9730cb16",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d38f92b2",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "879f5463",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}