sberhe committed
Commit 3e4970a
Parent: e8f5a2d

Update app.py

Files changed (1): app.py (+44 -56)
app.py CHANGED
@@ -6,11 +6,10 @@ from datasets import load_dataset
 from transformers import AutoTokenizer, TFAutoModel
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.cluster import KMeans
-import matplotlib.pyplot as plt
 from sklearn.decomposition import PCA
 
 # Load the dataset
-dataset = load_dataset("sberhe/2023-3-software-release-notes")
+dataset = load_dataset("sberhe/2023-1000-software-release-notes")
 
 # Load a pre-trained model and tokenizer (TensorFlow version)
 model_name = "bert-base-uncased"
@@ -21,67 +20,56 @@ model = TFAutoModel.from_pretrained(model_name)
 def tokenize_function(examples):
     return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
 
-# Tokenize and preprocess in batches
-batch_size = 8  # You can adjust this based on your available memory
-tokenized_datasets = dataset.map(tokenize_function, batched=True, batch_size=batch_size)
+tokenized_datasets = dataset.map(tokenize_function, batched=True)
 
 # Function to extract embeddings
 def extract_embeddings(batch):
     inputs = {k: tf.convert_to_tensor(v) for k, v in batch.items() if k in tokenizer.model_input_names}
-    outputs = model(**inputs, output_hidden_states=True, return_dict=True)
-    embeddings = outputs.last_hidden_state
-    return {"embeddings": embeddings.numpy()} if "embeddings" in outputs else {}
+    outputs = model(**inputs)
+    # Use the embeddings of the [CLS] token ([0])
+    return {"embeddings": outputs.last_hidden_state[:, 0].numpy()}
 
 # Apply the function to extract embeddings in batches
-embeddings_dataset = tokenized_datasets.map(extract_embeddings, batched=True, batch_size=batch_size)
+embeddings_dataset = tokenized_datasets.map(extract_embeddings, batched=True)
 
-# Tokenize the text and convert to numerical values
-inputs = tokenizer(tokenized_datasets["train"]["text"], return_tensors="tf", padding=True, truncation=True, max_length=512)
-outputs = model(**inputs)
-
-# Access the embeddings
-#embeddings = np.vstack(embeddings_dataset["train"]["text"])
-embeddings = outputs.last_hidden_state
-
-# Access the embeddings
-# Debugging code to print dataset keys
-st.write("Dataset Keys:", embeddings_dataset.column_names)
-embeddings_numpy = embeddings.numpy()
-embeddings_reshaped = embeddings_numpy.reshape(-1, 1)  # Adjust the shape as needed
-
-# Reduce dimensionality using PCA
-pca = PCA(n_components=1)  # You can adjust the number of components as needed
-embeddings_2d_reduced = pca.fit_transform(embeddings_reshaped)
+# Flatten the embeddings and reduce dimensionality using PCA
+embeddings = np.vstack(embeddings_dataset['train']['embeddings'])
+pca = PCA(n_components=2)  # Using 2 components for better visualization
+embeddings_2d = pca.fit_transform(embeddings)
 
 # Perform unsupervised clustering (K-Means)
-num_clusters = 5  # You can adjust this based on your data
+num_clusters = 50
 kmeans = KMeans(n_clusters=num_clusters)
-cluster_labels = kmeans.fit_predict(embeddings_2d_reduced)
-
-# Create a DataFrame with cluster labels and PCA results
-# Adjust the number of components based on data shape
-num_components = min(embeddings_2d_reduced.shape[0], embeddings_2d_reduced.shape[1])
-pca = PCA(n_components=num_components)
-pca_result = pca.fit_transform(embeddings_2d_reduced)
-df = pd.DataFrame({'Cluster': cluster_labels, 'PCA1': pca_result[:, 0]})
-
-# Streamlit app
-st.title("Software Release Clustering")
-
-# Display the clusters on a scatter plot
-st.write("Cluster Visualization:")
-fig, ax = plt.subplots()
-scatter = ax.scatter(df['PCA1'], df['PCA1'], c=df['Cluster'], cmap='viridis')
-
-ax.set_xlabel('PCA1')
-ax.set_ylabel('PCA2')
-st.pyplot(fig)
-
-# Display the number of releases in each cluster
-st.write("Cluster Counts:")
-st.write(df['Cluster'].value_counts())
-
-# Display the details of a selected cluster
-selected_cluster = st.selectbox("Select a Cluster to Explore", sorted(df['Cluster'].unique()))
-st.write("Releases in Cluster", selected_cluster)
-st.write(dataset['train'][df['Cluster'] == selected_cluster])
+cluster_labels = kmeans.fit_predict(embeddings_2d)
+
+# Create a DataFrame with cluster labels and original texts
+original_texts = [example['text'] for example in dataset['train']]
+df = pd.DataFrame({'text': original_texts, 'Cluster': cluster_labels})
+
+# ...
+
+# TF-IDF calculation and finding representative terms for each cluster
+vectorizer = TfidfVectorizer(stop_words='english')
+X_tfidf = vectorizer.fit_transform(df['text'])
+feature_names = vectorizer.get_feature_names_out()
+
+cluster_names = []
+for i in range(num_clusters):
+    indices = df[df['Cluster'] == i].index
+    # Aggregate the TF-IDF scores for each feature in cluster i
+    aggregated_tfidf = np.mean(X_tfidf[indices], axis=0)
+    # Convert to array (if it's not already an array) and get the index of the max tf-idf score
+    aggregated_tfidf_array = np.array(aggregated_tfidf).flatten()
+    max_tfidf_index = aggregated_tfidf_array.argmax()
+    cluster_names.append(feature_names[max_tfidf_index])
+
+# Count the size of each cluster
+cluster_sizes = df['Cluster'].value_counts().sort_index()
+
+# Output cluster names and sizes using Streamlit
+for i in range(num_clusters):
+    cluster_name = cluster_names[i]
+    cluster_size = cluster_sizes.get(i, 0)  # Get size with a default of 0 if cluster is empty
+    print(f"Cluster {i+1} (Name: {cluster_name}, Size: {cluster_size})")
+
+# ...
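Both hunks start at line 6 of app.py, so the module-level imports the visible code relies on are outside this diff. Judging from the names used (np, pd, tf, and the st calls in the removed Streamlit section), the top of the file presumably looks roughly like the sketch below; this is an inference from the hunk context, not part of the commit:

# Hypothetical app.py lines 1-5 (not shown in this diff), inferred from usage.
import streamlit as st              # used by the st.* calls removed in this commit
import numpy as np                  # np.vstack / np.mean in the new code
import pandas as pd                 # pd.DataFrame in the new code
import tensorflow as tf             # tf.convert_to_tensor in extract_embeddings
from datasets import load_dataset   # visible in the first hunk's context line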
 
 
 
 
 
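The new summary loop is commented "Output cluster names and sizes using Streamlit" but writes with print(), which in a Space only reaches the container logs. The commit leaves that as is; if the intent is to show the summary inside the app, a minimal sketch using the standard Streamlit calls (not part of this commit) could look like:

# Hypothetical Streamlit rendering of the same cluster summary (not in this commit).
st.title("Software Release Clustering")  # title retained from the removed UI section
for i in range(num_clusters):
    cluster_name = cluster_names[i]
    cluster_size = cluster_sizes.get(i, 0)  # 0 if a cluster ended up empty
    st.write(f"Cluster {i+1} (Name: {cluster_name}, Size: {cluster_size})")

Note also that KMeans(n_clusters=num_clusters) is constructed without a random_state, so cluster assignments, and therefore the derived cluster names, can vary between runs; passing a fixed random_state (for example KMeans(n_clusters=num_clusters, random_state=0)) would make the app's output reproducible.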