File size: 1,274 Bytes
631a798
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import torch
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Pick the compute device: the GPU when PyTorch reports CUDA as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Read the sample sentences: one per line, surrounding whitespace removed,
# and lines that are empty after stripping discarded.
with open("../data/sample_sentences.txt", "r", encoding="utf-8") as f:
    sentences = list(filter(None, map(str.strip, f)))

# Instantiate the sentence-embedding model on the device selected above.
model = SentenceTransformer(
    'all-MiniLM-L6-v2',
    device=device,
)

# Encode every sentence; the result holds one embedding per input sentence.
embeddings = model.encode(sentences)

# PCA visualization: project the embeddings onto their first two principal
# components and draw a scatter plot with each point labelled by its sentence.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(embeddings)

plt.figure(figsize=(8, 6))
plt.scatter(pca_result[:, 0], pca_result[:, 1])
# Rows of pca_result are in the same order as `sentences`, so pair them up
# directly instead of indexing.
for txt, point in zip(sentences, pca_result):
    plt.annotate(txt, (point[0], point[1]))
plt.title("Text Embeddings (PCA)")
plt.show()

# t-SNE visualization: embed the sentence vectors in 2-D with t-SNE and draw
# a scatter plot with each point labelled by its sentence.
#
# scikit-learn requires `perplexity` to be strictly less than the number of
# samples, so a fixed perplexity of 5 raises ValueError when the input file
# has 5 or fewer sentences. Cap it at n_samples - 1 (never below 1); with 6
# or more sentences the value is still 5, exactly as before.
tsne_perplexity = max(1, min(5, len(embeddings) - 1))
tsne = TSNE(n_components=2, random_state=42, perplexity=tsne_perplexity)
tsne_result = tsne.fit_transform(embeddings)

plt.figure(figsize=(8, 6))
plt.scatter(tsne_result[:, 0], tsne_result[:, 1])
for i, txt in enumerate(sentences):
    plt.annotate(txt, (tsne_result[i, 0], tsne_result[i, 1]))
plt.title("Text Embeddings (t-SNE)")
plt.show()