import datasets
import numpy as np
import sklearn
import sklearn.cluster
import sklearn.metrics
import torch
import tqdm
from llm2vec import LLM2Vec


# Hub id of the clustering benchmark and the instruction text that is paired
# with every sentence before encoding.
dataset_name = "mteb/twentynewsgroups-clustering"
instruction = "Identify the topic or theme of the given news articles: "

# The Hub id lives in `dataset_name` so that `dataset` only ever refers to the
# loaded dataset object instead of being rebound from a str.
dataset = datasets.load_dataset(dataset_name)
batch_size = 32  # forwarded to model.encode as its batch_size

print("Loading model...")
model = LLM2Vec.from_pretrained(
    "McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp",
    peft_model_name_or_path="McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-supervised",
    # Use the GPU when one is visible to torch, otherwise fall back to CPU.
    device_map="cuda" if torch.cuda.is_available() else "cpu",
    torch_dtype=torch.bfloat16,
)


def append_instruction(instruction: str, sentences: list) -> list:
    """Pair every sentence with the task instruction.

    Args:
        instruction: Instruction text placed first in each entry.
        sentences: Sentences to wrap; their order is preserved.

    Returns:
        A new list holding one ``[instruction, sentence, 0]`` entry per
        sentence. The result is empty when ``sentences`` is empty.
    """
    # NOTE(review): the trailing 0 is a constant third field; presumably a
    # placeholder required by the encoder's input format - confirm against
    # the LLM2Vec encode documentation.
    return [[instruction, sentence, 0] for sentence in sentences]


# Mini-batch size for K-Means; it never changes, so it is set once up front
# rather than on every loop iteration.
clustering_batch_size = 500

# One V-measure score per cluster set in the test split.
v_measures = []
for cluster_set in tqdm.tqdm(dataset["test"], desc="Clustering"):
    sentences = cluster_set["sentences"]
    labels = cluster_set["labels"]

    # tqdm.write emits the same text as print() but keeps status messages
    # from breaking up the progress bar.
    tqdm.tqdm.write(f"Encoding {len(sentences)} sentences...")
    new_sentences = append_instruction(instruction, sentences)
    corpus_embeddings = np.asarray(model.encode(new_sentences, batch_size=batch_size))

    tqdm.tqdm.write("Fitting Mini-Batch K-Means model...")
    # The number of clusters equals the number of distinct gold labels.
    # NOTE(review): no random_state is passed, so scores can vary from run
    # to run; pin a seed here if reproducibility matters.
    clustering_model = sklearn.cluster.MiniBatchKMeans(
        n_clusters=len(set(labels)), batch_size=clustering_batch_size
    )
    clustering_model.fit(corpus_embeddings)
    cluster_assignment = clustering_model.labels_

    tqdm.tqdm.write("Evaluating...")
    v_measure = sklearn.metrics.cluster.v_measure_score(labels, cluster_assignment)
    v_measures.append(v_measure)

# Aggregate across cluster sets. Only the mean is printed; the standard
# deviation stays available as a module-level value.
v_mean = np.mean(v_measures)
v_std = np.std(v_measures)

print(v_mean)