"""Smoke test for the AbstractPhil/geolip-captionbert-8192 model.

Loads the model and tokenizer from the Hugging Face Hub, embeds a small
probe word list, and prints embedding norms plus pairwise cosine
similarities.  Requires network access on first run (Hub download).
"""

from transformers import AutoModel, AutoTokenizer
import torch

# Hub repository providing both the custom model and its tokenizer.
REPO_ID = "AbstractPhil/geolip-captionbert-8192"

print("Loading model...")
# trust_remote_code=True executes model code shipped in the Hub repo —
# acceptable only because this specific repo is known/trusted.
model = AutoModel.from_pretrained(REPO_ID, trust_remote_code=True)
model.eval()
print(f" Parameters: {sum(p.numel() for p in model.parameters()):,}")

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(REPO_ID, trust_remote_code=True)
print(f" Vocab: {tokenizer.vocab_size}")

# Probe words: inflections, near-synonyms, and antonym pairs, so the
# similarity matrix printed below is easy to eyeball.
# NOTE(review): "woman" appears twice (indices 2 and 7) — possibly
# intentional as an identity check (self-similarity should be ~1.0).
texts = [
    "girl",
    "boy",
    "woman",
    "man",
    "mans",
    "womens",
    "women",
    "woman",
    "adjacency",
    "adjacent",
    "nearby",
    "near",
    "away",
    "aways",
    "similar",
    "dissimilar",
    "solid",
    "liquid",
    "prophetic",
    "predictive",
    "similarity",
    "differentiation",
    "differential",
    "addition",
    "subtraction",
    "division",
    "multiplication",
]

inputs = tokenizer(texts, max_length=8192, padding=True,
                   truncation=True, return_tensors="pt")

# Inference only — no gradients needed.
with torch.no_grad():
    outputs = model(**inputs)

# NOTE(review): assumes this custom model returns one pooled vector per
# input (shape [batch, dim]) in last_hidden_state rather than the usual
# per-token [batch, seq, dim] — the 2-D matmul below requires it; confirm
# against the repo's modeling code.
emb = outputs.last_hidden_state
print(f"\n Output shape: {emb.shape}")
print(f" Norms: {emb.norm(dim=-1).tolist()}")

# Fix: normalize before the dot product so the printed values are true
# cosine similarities even when the model does not emit unit-norm vectors.
# (No-op when the embeddings are already normalized; clamp avoids a
# divide-by-zero on a degenerate zero vector.)
unit = emb / emb.norm(dim=-1, keepdim=True).clamp_min(1e-12)
print(f"\n Pairwise cosine similarity:")
sim = unit @ unit.T
for i in range(len(texts)):
    for j in range(i + 1, len(texts)):
        print(f" [{i}]↔[{j}]: {sim[i, j]:.3f} ({texts[i][:40]}↔{texts[j][:40]})")

# Optional convenience API exposed by some embedding models.
if hasattr(model, 'encode'):
    print(f"\n Testing encode() method...")
    e = model.encode(["Hello world", "Testing the encoder"])
    print(f" Shape: {e.shape}")
    # Assumes encode() returns unit-norm vectors; otherwise this is a raw
    # dot product — TODO confirm against the repo's encode() implementation.
    print(f" Cosine: {(e[0] @ e[1]).item():.3f}")

print("\n✓ All tests passed")