#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Sun Jul 17 06:46:02 PM EDT 2022 author: Ryan Hildebrandt, github.com/ryancahildebrandt """ # imports import pandas as pd import random from hdbscan import HDBSCAN from sklearn.cluster import AffinityPropagation from sklearn.cluster import AgglomerativeClustering from sklearn.cluster import Birch from sklearn.cluster import DBSCAN from sklearn.cluster import KMeans from sklearn.cluster import MeanShift from sklearn.cluster import MiniBatchKMeans from sklearn.cluster import OPTICS from sklearn.cluster import SpectralClustering random.seed(42) def cluster_kmeans(in_embs, n_clusters, algorithm): """ KMeans(n_clusters=8, *, init='k-means++', n_init=10, max_iter=300, tol=0.0001, verbose=0, random_state=None, copy_x=True, algorithm='lloyd') """ return KMeans(n_clusters = n_clusters, algorithm = algorithm).fit(in_embs).labels_ def cluster_affinity(in_embs): """ AffinityPropagation(*, damping=0.5, max_iter=200, convergence_iter=15, copy=True, preference=None, affinity='euclidean', verbose=False, random_state=None)[source]¶ """ return AffinityPropagation().fit(in_embs).labels_ def cluster_agglom(in_embs, n_clusters, affinity, linkage): """ AgglomerativeClustering(n_clusters=2, *, affinity='euclidean', memory=None, connectivity=None, compute_full_tree='auto', linkage='ward', distance_threshold=None, compute_distances=False)[source]¶ """ return AgglomerativeClustering(n_clusters = n_clusters, affinity = affinity, linkage = linkage).fit(in_embs).labels_ def cluster_birch(in_embs, branching_factor, n_clusters): """ Birch(*, threshold=0.5, branching_factor=50, n_clusters=3, compute_labels=True, copy=True)[source]¶ """ return Birch(branching_factor = branching_factor, n_clusters = n_clusters).fit(in_embs).labels_ def cluster_dbscan(in_embs, eps, min_samples, metric): """ DBSCAN(eps=0.5, *, min_samples=5, metric='euclidean', metric_params=None, algorithm='auto', leaf_size=30, p=None, n_jobs=None)[source]¶ """ return DBSCAN(eps = eps, min_samples = min_samples, metric = metric).fit(in_embs).labels_ def cluster_minikmeans(in_embs, n_clusters): """ MiniBatchKMeans(n_clusters=8, *, init='k-means++', max_iter=100, batch_size=1024, verbose=0, compute_labels=True, random_state=None, tol=0.0, max_no_improvement=10, init_size=None, n_init=3, reassignment_ratio=0.01)[source]¶ """ return MiniBatchKMeans(n_clusters = n_clusters).fit(in_embs).labels_ def cluster_meanshift(in_embs, bin_seeding, cluster_all): """ MeanShift(*, bandwidth=None, seeds=None, bin_seeding=False, min_bin_freq=1, cluster_all=True, n_jobs=None, max_iter=300) """ return MeanShift(bin_seeding = bin_seeding, cluster_all = cluster_all).fit(in_embs).labels_ def cluster_optics(in_embs, min_samples, metric, min_cluster_size): """ OPTICS(*, min_samples=5, max_eps=inf, metric='minkowski', p=2, metric_params=None, cluster_method='xi', eps=None, xi=0.05, predecessor_correction=True, min_cluster_size=None, algorithm='auto', leaf_size=30, memory=None, n_jobs=None)[source]¶ """ return OPTICS(min_samples = min_samples, metric = metric, min_cluster_size = min_cluster_size).fit(in_embs).labels_ def cluster_spectral(in_embs, n_clusters, affinity): """ SpectralClustering(n_clusters=8, *, eigen_solver=None, n_components=None, random_state=None, n_init=10, gamma=1.0, affinity='rbf', n_neighbors=10, eigen_tol=0.0, assign_labels='kmeans', degree=3, coef0=1, kernel_params=None, n_jobs=None, verbose=False)[source]¶ """ return SpectralClustering(n_clusters = n_clusters, affinity = affinity).fit(in_embs).labels_ def cluster_hdbscan(in_embs, alpha, metric, min_cluster_size): """ HDBSCAN(algorithm='best', alpha=1.0, approx_min_span_tree=True, gen_min_span_tree=True, leaf_size=40, memory=Memory(cachedir=None), metric='euclidean', min_cluster_size=5, min_samples=None, p=None) """ return HDBSCAN(alpha = alpha, metric = metric, min_cluster_size = min_cluster_size).fit(in_embs).labels_ metrics_list = ["cityblock", "cosine", "euclidean", "l1", "l2", "manhattan", "braycurtis", "canberra", "chebyshev", "correlation", "dice", "hamming", "jaccard", "kulsinski", "mahalanobis", "minkowski", "rogerstanimoto", "russellrao", "seuclidean", "sokalmichener", "sokalsneath", "sqeuclidean", "yule"] def cluster_ex(in_text, labels): out = pd.DataFrame({"Text" : in_text , "Cluster" : labels}).sort_values(by = "Cluster") return out