# Spaces: Runtime error
import tensorflow as tf | |
import streamlit as st | |
import pandas as pd | |
import numpy as np | |
from datasets import load_dataset | |
from transformers import AutoTokenizer, TFAutoModel | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.cluster import KMeans | |
from sklearn.decomposition import PCA | |
# --- Data and model setup ---------------------------------------------------
# Load the release-notes dataset by its repository id.
dataset = load_dataset("sberhe/2023-1000-software-release-notes")

# One checkpoint name is shared by the tokenizer and the TensorFlow model so
# the two always match.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModel.from_pretrained(model_name)
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Each sequence is truncated and padded to exactly 512 tokens, so every
    row of the result has the same length.
    """
    texts = examples["text"]
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=512,
    )


# Run the tokenizer over the dataset in batched mode.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
def extract_embeddings(batch):
    """Return the [CLS] embedding for every example in ``batch``.

    Only the columns the tokenizer lists as model inputs are forwarded to
    the model.  The first-token vector of the last hidden state is returned
    under the ``"embeddings"`` key as a NumPy array.
    """
    model_inputs = {}
    for column, values in batch.items():
        if column in tokenizer.model_input_names:
            model_inputs[column] = tf.convert_to_tensor(values)

    hidden_states = model(**model_inputs).last_hidden_state
    # Position 0 of each sequence holds the [CLS] token.
    cls_vectors = hidden_states[:, 0]
    return {"embeddings": cls_vectors.numpy()}
# Apply the function to extract embeddings in small batches.  Without an
# explicit ``batch_size`` the map call hands 1000 examples of 512 tokens each
# to the model in a single forward pass, which can exhaust memory.
embeddings_dataset = tokenized_datasets.map(
    extract_embeddings, batched=True, batch_size=16
)

# Stack the per-example embeddings into one 2-D matrix and reduce its
# dimensionality using PCA.  The seed is pinned so the projection is
# reproducible between runs.
embeddings = np.vstack(embeddings_dataset['train']['embeddings'])
pca = PCA(n_components=2, random_state=42)  # 2 components for visualization
embeddings_2d = pca.fit_transform(embeddings)

# Perform unsupervised clustering (K-Means) on the 2-D projection.
# ``n_init`` and ``random_state`` are set explicitly so that cluster ids (and
# therefore the derived cluster names) are stable across reruns and do not
# depend on the installed scikit-learn version's defaults.
num_clusters = 50
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(embeddings_2d)

# Create a DataFrame with cluster labels and original texts.  Reading the
# column directly avoids building one dict per row.
original_texts = list(dataset['train']['text'])
df = pd.DataFrame({'text': original_texts, 'Cluster': cluster_labels})
# ... | |
# TF-IDF calculation and finding representative terms for each cluster
vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = vectorizer.fit_transform(df['text'])
feature_names = vectorizer.get_feature_names_out()

# The TF-IDF matrix is addressed by row position, so work with positions
# taken from the label array rather than with DataFrame index labels.
cluster_column = df['Cluster'].to_numpy()
cluster_names = []
for i in range(num_clusters):
    row_positions = np.flatnonzero(cluster_column == i)
    if row_positions.size == 0:
        # Averaging zero rows is undefined; give the cluster a placeholder
        # name so the list still has one entry per cluster id.
        cluster_names.append("(empty)")
        continue
    # Mean TF-IDF score of every term over the documents in cluster i,
    # flattened to a 1-D array.
    aggregated_tfidf = np.asarray(X_tfidf[row_positions].mean(axis=0)).ravel()
    # The term with the highest mean score names the cluster.
    cluster_names.append(feature_names[aggregated_tfidf.argmax()])
# Count the size of each cluster
cluster_sizes = df['Cluster'].value_counts().sort_index()

# Output cluster names and sizes using Streamlit.  ``print`` alone only
# reaches the process log, so each line is also rendered in the app.
# ``st.text`` is used so that term names are shown verbatim instead of being
# interpreted as Markdown.
for i, cluster_name in enumerate(cluster_names):
    # Clusters missing from the counts get a size of 0.
    cluster_size = cluster_sizes.get(i, 0)
    summary = f"Cluster {i+1} (Name: {cluster_name}, Size: {cluster_size})"
    print(summary)
    st.text(summary)
# ... |