In [2]:
import csv
import gzip
from math import log
from collections import Counter
from sys import maxsize
import numpy as np
import joblib
from collections import OrderedDict
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import sys
from scipy.sparse import dok_matrix
from sklearn.preprocessing import normalize
from sklearn.decomposition import TruncatedSVD



posts_file = 'posts-2024-04-14.csv.gz'
fluffyrock_tags_list_file = 'fluffyrock_3m.csv'


def extract_artist_names(file_path):
 """
 Extract artist names from a CSV file where each row contains tag information,
 and the first column contains the tag's name. Artist tags start with 'by_'.

 :param file_path: Path to the CSV file
 :return: A set containing artist names without the 'by_' prefix
 """
 artists = set()

 # Open the CSV file and read it
 with open(file_path, newline='', encoding='utf-8') as csvfile:
 reader = csv.reader(csvfile)
 
 # Iterate over each row in the CSV file
 for row in reader:
 tag_name = row[0] # Assuming the first column contains the tag names
 if tag_name.startswith('by_'):
 # Strip 'by_' from the start of the tag name and add it to the set
 artist_name = tag_name[3:] # Remove the first three characters 'by_'
 artists.add(tag_name)

 return artists


def build_tag_list(tags, e621_rating_character, fav_count, artist_names):
 results = []
 
 #score
 score_value = min(1.0, (log(int(fav_count)+1) / 10))
 rounded_score_value = round(score_value * 10)
 results.append(f"score: {rounded_score_value}")
 
 #rating
 results.append("rating:" + e621_rating_character)
 
 #regular tags and artists
 for tag in tags:
 if tag in artist_names:
 results.append("by_" + tag)
 else:
 results.append(tag)
 return results


def read_csv_as_dict(file_path):
 """
 Generator function to read a gzipped CSV file and yield each row as a dictionary
 where keys are the column names and values are the data in each column.

 :param file_path: Path to the .csv.gz file
 """
 
 #counter=0
 with gzip.open(file_path, 'rt', newline='', encoding='utf-8') as gz_file:
 csv.field_size_limit(1000000)
 reader = csv.DictReader(gz_file)
 for row in reader:
 #counter += 1
 #if counter % 100 == 0:
 yield row
 
 
def process_tags_from_csv(file_path, artist_names):
 """
 Generator function that reads rows from a CSV file, processes each row to extract and
 build tag lists, and yields these lists one at a time.

 :param file_path: The path to the gzipped CSV file.
 :param artist_names: A set containing all artist names for tag processing.
 :return: Yields lists of tags for each row.
 """
 for row in read_csv_as_dict(file_path):
 base_tags = row['tag_string'].split(' ')
 rating_character = row['rating']
 fav_count = row['fav_count']
 all_tags = build_tag_list(base_tags, rating_character, fav_count, artist_names)
 yield all_tags
 
 
def construct_pseudo_vector(pseudo_doc_terms, idf_loaded, tag_to_column_loaded):
 # Initialize a vector of zeros with the length of the term_to_index mapping
 pseudo_vector = np.zeros(len(tag_to_column_loaded))
 
 # Fill in the vector for terms in the pseudo document
 for term in pseudo_doc_terms:
 if term in tag_to_column_loaded:
 index = tag_to_column_loaded[term]
 pseudo_vector[index] = idf_loaded.get(term, 0)
 
 # Return the vector as a 2D array for compatibility with SVD transform
 return pseudo_vector.reshape(1, -1)

In [None]:
all_artist_names = extract_artist_names(fluffyrock_tags_list_file)

tag_count = Counter()
min_occurrences = 200
 
for all_tags in process_tags_from_csv(posts_file, all_artist_names):
 tag_count.update(all_tags)
 

# Apply the counting logic from the first code snippet
sorted_tags = tag_count.most_common()
filtered_tags = [tag for tag, count in sorted_tags if count >= min_occurrences]

# Print tag counts before and after filtering
print("Tag count before filtering: ", len(tag_count))
print("Tag count after filtering: ", len(filtered_tags))

In [None]:
# Initialize a dictionary to hold the co-occurrences for each tag in filtered_tags
# Using a nested defaultdict for automatic handling of missing keys
pseudo_docs = defaultdict(lambda: defaultdict(int))

# Number of tags processed
total_rows_processed = 0

# Read each row and process the tags
for all_tags in process_tags_from_csv(posts_file, all_artist_names):
 # Filter the tags in the current list to include only those in filtered_tags
 filtered_tag_list = [tag for tag in all_tags if tag in filtered_tags]
 
 # For each tag in the filtered list
 for tag in filtered_tag_list:
 # For each co-occurring tag in the same list
 for co_occur_tag in filtered_tag_list:
 if co_occur_tag != tag:
 pseudo_docs[tag][co_occur_tag] += 1

 # Counting total tags processed for progress monitoring
 total_rows_processed += 1
 if total_rows_processed % 10000 == 0:
 print(f"Processed {total_rows_processed} rows", file=sys.stderr)

print("Processing complete.")


In [None]:
# Number of pseudo-documents
N = len(pseudo_docs)

# Calculate TF and DF
tf = {}
df = {}
for doc, terms in pseudo_docs.items():
 tf[doc] = {}
 total_terms = sum(terms.values())
 for term, count in terms.items():
 tf[doc][term] = count / total_terms # Term Frequency
 df[term] = df.get(term, 0) + 1 # Document Frequency
 
# Ensure all terms are indexed
all_terms = set(df.keys())
term_to_column_index = {term: idx for idx, term in enumerate(all_terms)}

# Calculate IDF
idf = {term: log((N + 1) / (df_val + 1)) for term, df_val in df.items()} # Adding 1 to prevent division by zero

# Initialize the TF-IDF matrix
tfidf_matrix = dok_matrix((N, len(df)), dtype=float)

# Mapping of tags to matrix rows
tag_to_row = {tag: idx for idx, tag in enumerate(pseudo_docs)}

# Compute TF-IDF and fill the matrix
for doc, terms in tf.items():
 row_idx = tag_to_row[doc]
 for term, tf_val in terms.items():
 col_idx = term_to_column_index[term] # Use term_to_index for column indexing
 tfidf_matrix[row_idx, col_idx] = tf_val * idf[term]

# Convert to CSR format for efficient row slicing
tfidf_matrix = tfidf_matrix.tocsr()

print("TF-IDF matrix shape:", tfidf_matrix.shape)


In [None]:
# Choose the number of components for the reduced dimensionality
n_components = 300 # For example, reducing to 300 dimensions

# Initialize the TruncatedSVD object
svd = TruncatedSVD(n_components=n_components, random_state=42)

# Fit and transform the TF-IDF matrix
reduced_matrix = svd.fit_transform(tfidf_matrix)

# 'reduced_matrix' now has a shape of (8500, n_components), e.g., (8500, 300)

In [None]:
# Step 1: Construct TF vector for the pseudo-document
pseudo_doc_terms = ["female"]
pseudo_tfidf_vector = construct_pseudo_vector(pseudo_doc_terms, idf, term_to_column_index)

# Assuming 'tfidf_matrix' is your original TF-IDF matrix and 'reduced_matrix' is obtained from Truncated SVD
# 'pseudo_tfidf_vector' is the TF-IDF vector for your pseudo-document, constructed as previously discussed

# For the original TF-IDF matrix
# Compute cosine similarities
cosine_similarities_full = cosine_similarity(pseudo_tfidf_vector, tfidf_matrix).flatten()
print("Cosine similarities (full matrix):", cosine_similarities_full)
# Identify the indices of the top 10 most similar tags
top_indices_full = np.argsort(cosine_similarities_full)[-10:][::-1]

# For the reduced matrix
# Reduce the dimensionality of the pseudo-document vector
# Before calculating similarities, print the TF-IDF vectors
print("Pseudo TF-IDF vector:", pseudo_tfidf_vector)
reduced_pseudo_vector = svd.transform(pseudo_tfidf_vector)
print("Reduced pseudo-document vector:", reduced_pseudo_vector)

# Compute cosine similarities in the reduced space
cosine_similarities_reduced = cosine_similarity(reduced_pseudo_vector, reduced_matrix).flatten()
print("Cosine similarities (reduced matrix):", cosine_similarities_reduced)


# Identify the indices of the top 10 most similar tags in the reduced space, sorted from most to least similar
top_indices_reduced = np.argsort(cosine_similarities_reduced)[-10:][::-1]


# Convert indices to tag names using the inverse of your 'tag_to_row' mapping
# Printing the tag to index and index to tag mappings
print("tag_to_row mapping (partial):", dict(list(tag_to_row.items())[:12])) # Print only first 10 for brevity
row_to_tag = {idx: tag for tag, idx in tag_to_row.items()}
print("row_to_tag mapping (partial):", dict(list(row_to_tag.items())[:12]))

# Generate lists of tags with their corresponding similarity scores
top_tags_full = [(row_to_tag[idx], cosine_similarities_full[idx]) for idx in top_indices_full]
top_tags_reduced = [(row_to_tag[idx], cosine_similarities_reduced[idx]) for idx in top_indices_reduced]

# Output the results with scores
print("Most similar tags (Full Matrix):")
for tag, score in top_tags_full:
 print(f"{tag}: {score:.4f}")

print("Most similar tags (Reduced Matrix):")
for tag, score in top_tags_reduced:
 print(f"{tag}: {score:.4f}")


In [None]:
#Save the model to a file

# Package necessary components
components_to_save = {
 'idf': idf,
 'tag_to_column_index': term_to_column_index,
 'row_to_tag': row_to_tag, 
 'reduced_matrix': reduced_matrix,
 'svd_model': svd
}

# Save the components into a file
joblib.dump(components_to_save, 'components_file418.joblib')

In [3]:
#Reload and test file

# Load the saved components from the joblib file
components = joblib.load('tf_idf_files_418_updated.joblib')

# Extract necessary components
idf = components['idf']
term_to_column_index = components['tag_to_column_index']
row_to_tag = components['row_to_tag']
reduced_matrix = components['reduced_matrix']
svd = components['svd_model']

# Construct the TF-IDF vector for "domestic_dog"
pseudo_tfidf_vector = construct_pseudo_vector("blue_(jurassic_world)", idf, term_to_column_index)

# Reduce the dimensionality of the pseudo-document vector for the reduced matrix
reduced_pseudo_vector = svd.transform(pseudo_tfidf_vector)

# Compute cosine similarities in the reduced space
cosine_similarities_reduced = cosine_similarity(reduced_pseudo_vector, reduced_matrix).flatten()

# Sort the indices by descending cosine similarity
top_indices_reduced = np.argsort(cosine_similarities_reduced)[::-1][:10]

# Display the most similar tags in the reduced matrix with their scores
print("Most similar tags (Reduced Matrix):")
for idx in top_indices_reduced:
 tag = row_to_tag[idx]
 score = cosine_similarities_reduced[idx]
 print(f"{tag}: {score:.4f}")


Most similar tags (Reduced Matrix):
nameless_(arbuzbudesh): 0.0000
knotted_dildo: 0.0000
black_legs: 0.0000
disguise: 0.0000
lineup: 0.0000
olympics: 0.0000
burping: 0.0000
pink_collar: 0.0000
team_rocket: 0.0000
studded_bracelet: 0.0000
