import tensorflow as tf
import streamlit as st
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
# Load the dataset
dataset = load_dataset("sberhe/2023-1000-software-release-notes")
# Load a pre-trained model and tokenizer (TensorFlow version)
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModel.from_pretrained(model_name)
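# TFAutoModel loads the bare BERT encoder without a task head, so its outputs
# expose last_hidden_state, which the embedding extraction below relies on.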
# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
tokenized_datasets = dataset.map(tokenize_function, batched=True)
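# The map above adds the tokenizer outputs as new columns ("input_ids",
# "attention_mask" and, for BERT, "token_type_ids") alongside the original "text".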
# Function to extract embeddings
def extract_embeddings(batch):
    inputs = {k: tf.convert_to_tensor(v) for k, v in batch.items() if k in tokenizer.model_input_names}
    outputs = model(**inputs)
    # Use the embedding of the [CLS] token (position 0) as the sequence representation
    return {"embeddings": outputs.last_hidden_state[:, 0].numpy()}
# Apply the function to extract embeddings in batches
embeddings_dataset = tokenized_datasets.map(extract_embeddings, batched=True)
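# Note: datasets.map with batched=True uses a default batch_size of 1000, so each
# call to extract_embeddings runs a BERT forward pass over up to 1000 sequences.
# If that exhausts memory, a smaller batch can be requested (a sketch, not part
# of the original script):
# embeddings_dataset = tokenized_datasets.map(extract_embeddings, batched=True, batch_size=32)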
# Flatten the embeddings and reduce dimensionality using PCA
embeddings = np.vstack(embeddings_dataset['train']['embeddings'])
pca = PCA(n_components=2) # Using 2 components for better visualization
embeddings_2d = pca.fit_transform(embeddings)
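# Quick check (an addition, not in the original script) of how much variance the
# 2-D projection retains; explained_variance_ratio_ is a standard PCA attribute.
st.write(f"PCA explained variance (2 components): {pca.explained_variance_ratio_.sum():.2%}")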
# Perform unsupervised clustering (K-Means)
num_clusters = 50
kmeans = KMeans(n_clusters=num_clusters)
cluster_labels = kmeans.fit_predict(embeddings_2d)
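# Optional visual sanity check (a sketch, not part of the original app): plot the
# PCA projection coloured by cluster label. Assumes matplotlib is available in the
# Space's environment; uncomment to render the figure in Streamlit.
# import matplotlib.pyplot as plt
# fig, ax = plt.subplots()
# ax.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=cluster_labels, s=5, cmap="tab20")
# ax.set_xlabel("PCA component 1")
# ax.set_ylabel("PCA component 2")
# st.pyplot(fig)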
# Create a DataFrame with cluster labels and original texts
original_texts = [example['text'] for example in dataset['train']]
df = pd.DataFrame({'text': original_texts, 'Cluster': cluster_labels})
# ...
# TF-IDF calculation and finding representative terms for each cluster
vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = vectorizer.fit_transform(df['text'])
feature_names = vectorizer.get_feature_names_out()
cluster_names = []
for i in range(num_clusters):
    indices = df[df['Cluster'] == i].index
    # Aggregate the TF-IDF scores for each feature across the documents in cluster i
    aggregated_tfidf = np.mean(X_tfidf[indices], axis=0)
    # Flatten to a 1-D array and take the index of the highest-scoring term
    aggregated_tfidf_array = np.array(aggregated_tfidf).flatten()
    max_tfidf_index = aggregated_tfidf_array.argmax()
    cluster_names.append(feature_names[max_tfidf_index])
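# A single top term can be an ambiguous cluster name. One possible variant (a
# sketch, not in the original script) keeps the three highest-scoring terms
# instead, inside the loop above:
#     top_indices = aggregated_tfidf_array.argsort()[-3:][::-1]
#     cluster_names.append(", ".join(feature_names[j] for j in top_indices))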
# Count the size of each cluster
cluster_sizes = df['Cluster'].value_counts().sort_index()
# Output cluster names and sizes using Streamlit
for i in range(num_clusters):
    cluster_name = cluster_names[i]
    cluster_size = cluster_sizes.get(i, 0)  # default to 0 if a cluster ended up empty
    st.write(f"Cluster {i+1} (Name: {cluster_name}, Size: {cluster_size})")
# ...