import tensorflow as tf
import streamlit as st
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Load the dataset
dataset = load_dataset("sberhe/2023-1000-software-release-notes")

# Load a pre-trained model and tokenizer (TensorFlow version)
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModel.from_pretrained(model_name)

# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Function to extract embeddings
def extract_embeddings(batch):
    inputs = {k: tf.convert_to_tensor(v) for k, v in batch.items() if k in tokenizer.model_input_names}
    outputs = model(**inputs)
    # Use the embedding of the [CLS] token (position 0)
    return {"embeddings": outputs.last_hidden_state[:, 0].numpy()}

# Apply the function to extract embeddings in batches
embeddings_dataset = tokenized_datasets.map(extract_embeddings, batched=True)

# Stack the embeddings and reduce dimensionality using PCA
embeddings = np.vstack(embeddings_dataset['train']['embeddings'])
pca = PCA(n_components=2)  # Using 2 components for easier visualization
embeddings_2d = pca.fit_transform(embeddings)

# Perform unsupervised clustering (K-Means)
num_clusters = 50
kmeans = KMeans(n_clusters=num_clusters)
cluster_labels = kmeans.fit_predict(embeddings_2d)

# Create a DataFrame with cluster labels and original texts
original_texts = [example['text'] for example in dataset['train']]
df = pd.DataFrame({'text': original_texts, 'Cluster': cluster_labels})

# ...

# TF-IDF calculation and finding a representative term for each cluster
vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = vectorizer.fit_transform(df['text'])
feature_names = vectorizer.get_feature_names_out()

cluster_names = []
for i in range(num_clusters):
    indices = df[df['Cluster'] == i].index
    # Aggregate the TF-IDF scores for each feature in cluster i
    aggregated_tfidf = np.mean(X_tfidf[indices], axis=0)
    # Flatten to a 1-D array and take the index of the highest TF-IDF score
    aggregated_tfidf_array = np.array(aggregated_tfidf).flatten()
    max_tfidf_index = aggregated_tfidf_array.argmax()
    cluster_names.append(feature_names[max_tfidf_index])

# Count the size of each cluster
cluster_sizes = df['Cluster'].value_counts().sort_index()

# Output cluster names and sizes using Streamlit
for i in range(num_clusters):
    cluster_name = cluster_names[i]
    cluster_size = cluster_sizes.get(i, 0)  # Default to 0 if the cluster is empty
    st.write(f"Cluster {i+1} (Name: {cluster_name}, Size: {cluster_size})")

# ...
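
# Optional: a minimal sketch of rendering the same cluster summary with
# Streamlit's built-in widgets instead of line-by-line output. The name
# `summary_df` and the st.dataframe / st.bar_chart layout are illustrative
# assumptions, not part of the original app.
summary_df = pd.DataFrame({
    "Cluster": range(1, num_clusters + 1),
    "Name": cluster_names,
    "Size": [cluster_sizes.get(i, 0) for i in range(num_clusters)],
})
st.dataframe(summary_df)                               # interactive table of cluster names and sizes
st.bar_chart(summary_df.set_index("Cluster")["Size"])  # cluster sizes as a bar chart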