import tensorflow as tf
import streamlit as st
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Load the dataset
dataset = load_dataset("sberhe/2023-1000-software-release-notes")

# Load a pre-trained model and tokenizer (TensorFlow version)
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModel.from_pretrained(model_name)

# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Function to extract embeddings
def extract_embeddings(batch):
    inputs = {k: tf.convert_to_tensor(v) for k, v in batch.items() if k in tokenizer.model_input_names}
    outputs = model(**inputs)
    # Use the embedding of the [CLS] token (position 0 in the sequence)
    return {"embeddings": outputs.last_hidden_state[:, 0].numpy()}

# Apply the function to extract embeddings in batches
embeddings_dataset = tokenized_datasets.map(extract_embeddings, batched=True)

# Flatten the embeddings and reduce dimensionality using PCA
embeddings = np.vstack(embeddings_dataset['train']['embeddings'])
pca = PCA(n_components=2)  # Using 2 components for better visualization
embeddings_2d = pca.fit_transform(embeddings)

# Perform unsupervised clustering (K-Means)
num_clusters = 50
kmeans = KMeans(n_clusters=num_clusters)
cluster_labels = kmeans.fit_predict(embeddings_2d)

# Create a DataFrame with cluster labels and original texts
original_texts = [example['text'] for example in dataset['train']]
df = pd.DataFrame({'text': original_texts, 'Cluster': cluster_labels})

# ...
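# --- Hedged sketch, not part of the original script: one way the 2-D PCA projection
# --- computed above could be rendered in the Streamlit app, coloring points by their
# --- K-Means cluster. It reuses embeddings_2d, cluster_labels and num_clusters from
# --- above; matplotlib is an extra dependency assumed here.
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=cluster_labels, cmap="tab20", s=10)
ax.set_xlabel("PCA component 1")
ax.set_ylabel("PCA component 2")
ax.set_title(f"Release notes in {num_clusters} K-Means clusters")
st.pyplot(fig)  # Streamlit renders the matplotlib figure in the app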

# TF-IDF calculation and finding representative terms for each cluster
vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = vectorizer.fit_transform(df['text'])
feature_names = vectorizer.get_feature_names_out()

cluster_names = []
for i in range(num_clusters):
    indices = df[df['Cluster'] == i].index.to_numpy()
    # Aggregate the TF-IDF scores for each feature in cluster i
    aggregated_tfidf = X_tfidf[indices].mean(axis=0)
    # Flatten the (1, n_features) result and take the index of the highest score
    aggregated_tfidf_array = np.asarray(aggregated_tfidf).flatten()
    max_tfidf_index = aggregated_tfidf_array.argmax()
    cluster_names.append(feature_names[max_tfidf_index])

# Count the size of each cluster
cluster_sizes = df['Cluster'].value_counts().sort_index()

# Output cluster names and sizes using Streamlit
for i in range(num_clusters):
    cluster_name = cluster_names[i]
    cluster_size = cluster_sizes.get(i, 0)  # Default to 0 if the cluster received no points
    st.write(f"Cluster {i+1} (Name: {cluster_name}, Size: {cluster_size})")

# ...
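# Usage note (assuming this file is saved as app.py):
#   streamlit run app.py
# launches the app and renders the cluster names and sizes written with st.write above.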