import tensorflow as tf
import streamlit as st
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
# Load the dataset
dataset = load_dataset("sberhe/2023-1000-software-release-notes")
# Load a pre-trained model and tokenizer (TensorFlow version)
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModel.from_pretrained(model_name)
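# TFAutoModel loads the bare BERT encoder (no task head); only its hidden
# states are needed to derive document embeddings below.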
# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
tokenized_datasets = dataset.map(tokenize_function, batched=True)
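# map(batched=True) adds the encoder inputs (input_ids, attention_mask,
# token_type_ids for BERT) as new columns alongside the original text.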
# Function to extract embeddings
def extract_embeddings(batch):
    inputs = {k: tf.convert_to_tensor(v) for k, v in batch.items() if k in tokenizer.model_input_names}
    outputs = model(**inputs)
    # Use the embedding of the [CLS] token (position 0) as the sequence representation
    return {"embeddings": outputs.last_hidden_state[:, 0].numpy()}
# Apply the function to extract embeddings in batches
embeddings_dataset = tokenized_datasets.map(extract_embeddings, batched=True)
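# Note: this runs full BERT inference over every example; datasets.map
# processes 1000 rows at a time by default (tune with its batch_size argument).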
# Flatten the embeddings and reduce dimensionality using PCA
embeddings = np.vstack(embeddings_dataset['train']['embeddings'])
pca = PCA(n_components=2) # Using 2 components for better visualization
embeddings_2d = pca.fit_transform(embeddings)
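# Optional sanity check: pca.explained_variance_ratio_.sum() gives the share
# of variance the two retained components explain.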
# Perform unsupervised clustering (K-Means)
num_clusters = 50
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)  # seed pinned for reproducible labels
cluster_labels = kmeans.fit_predict(embeddings_2d)
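# num_clusters = 50 is a heuristic; an elbow plot over kmeans.inertia_ or a
# silhouette score would be the usual way to tune it.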
# Create a DataFrame with cluster labels and original texts
original_texts = dataset['train']['text']
df = pd.DataFrame({'text': original_texts, 'Cluster': cluster_labels})
# ...
# TF-IDF calculation and finding representative terms for each cluster
vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = vectorizer.fit_transform(df['text'])
feature_names = vectorizer.get_feature_names_out()
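# Column j of X_tfidf corresponds to feature_names[j], so an argmax over the
# aggregated scores maps directly back to a vocabulary term.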
cluster_names = []
for i in range(num_clusters):
    indices = df[df['Cluster'] == i].index
    # Average the TF-IDF scores over all documents in cluster i (sparse matrix mean)
    aggregated_tfidf = X_tfidf[indices].mean(axis=0)
    # Flatten to a 1-D array and take the term with the highest mean TF-IDF score
    aggregated_tfidf_array = np.asarray(aggregated_tfidf).flatten()
    max_tfidf_index = aggregated_tfidf_array.argmax()
    cluster_names.append(feature_names[max_tfidf_index])
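# Each cluster is named by its single highest-scoring term; np.argsort on the
# aggregated scores would yield multi-word labels instead.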
# Count the size of each cluster
cluster_sizes = df['Cluster'].value_counts().sort_index()
# Output cluster names and sizes using Streamlit
for i in range(num_clusters):
    cluster_name = cluster_names[i]
    cluster_size = cluster_sizes.get(i, 0)  # default to 0 if a cluster is empty
    st.write(f"Cluster {i+1} (Name: {cluster_name}, Size: {cluster_size})")
# ...