"""
Example usage script for LLM2Vec4CXR model.
This demonstrates how to load and use the model for chest X-ray report analysis.
Prerequisites:
1. Install the LLM2Vec4CXR package:
pip install git+https://github.com/lukeingawesome/llm2vec4cxr.git
Or clone and install in development mode:
git clone https://github.com/lukeingawesome/llm2vec4cxr.git
cd llm2vec4cxr
pip install -e .
2. The model will be automatically downloaded from Hugging Face when first used.
"""
import torch
import torch.nn.functional as F
from llm2vec_wrapper import LLM2VecWrapper as LLM2Vec

def load_llm2vec4cxr_model(model_name_or_path="lukeingawesome/llm2vec4cxr"):
    """
    Load the LLM2Vec4CXR model with the proper configuration.

    Args:
        model_name_or_path (str): Hugging Face model ID or local path.

    Returns:
        tuple: (model, tokenizer)
    """
    # Load the model with the specific configuration used for LLM2Vec4CXR
    model = LLM2Vec.from_pretrained(
        base_model_name_or_path=model_name_or_path,
        enable_bidirectional=True,
        pooling_mode="latent_attention",  # This is the key modification
        max_length=512,
        torch_dtype=torch.bfloat16,
    )

    # Configure the tokenizer; left padding keeps the text tokens at the end
    # of each sequence, which the embed-mask logic below relies on.
    tokenizer = model.tokenizer
    tokenizer.padding_side = 'left'

    return model, tokenizer
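
# Minimal smoke test (a hedged sketch; `encode_text` is the wrapper's built-in
# embedding helper, demonstrated more fully in main() below):
#
#     model, tokenizer = load_llm2vec4cxr_model()
#     emb = model.encode_text("No acute cardiopulmonary abnormality.")
#     print(emb.shape)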

def tokenize_with_separator(texts, tokenizer, max_length=512):
    """
    Tokenize texts with special handling for separator-based splitting.

    Each text may contain a separator ('!@#$%^&*()') dividing an instruction
    from the passage to embed; only the passage tokens are marked in the embed
    mask. This is useful for instruction-following tasks.

    Args:
        texts (list): List of texts to tokenize.
        tokenizer: The tokenizer to use.
        max_length (int): Maximum sequence length.

    Returns:
        dict: Tokenized inputs with attention masks and embed masks.
    """
    texts_2 = []
    original_texts = []
    separator = '!@#$%^&*()'

    for text in texts:
        parts = text.split(separator)
        texts_2.append(parts[1] if len(parts) > 1 else "")
        original_texts.append("".join(parts))

    # Tokenize the full (instruction + passage) texts
    tokenized = tokenizer(
        original_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    )

    # Create embedding masks that mark only the passage tokens. With left
    # padding, the passage occupies the end of each sequence, so the last
    # len(passage_tokens) positions are set to 1.
    embed_mask = None
    for t_i, t in enumerate(texts_2):
        ids = tokenizer(
            [t],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
            add_special_tokens=False,
        )
        e_m = torch.zeros_like(tokenized["attention_mask"][t_i])
        if len(ids["input_ids"][0]) > 0:
            e_m[-len(ids["input_ids"][0]):] = torch.ones(len(ids["input_ids"][0]))
        if embed_mask is None:
            embed_mask = e_m.unsqueeze(0)
        else:
            embed_mask = torch.cat((embed_mask, e_m.unsqueeze(0)), dim=0)

    tokenized["embed_mask"] = embed_mask
    return tokenized
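
# Illustration (hedged; exact token counts depend on the tokenizer): for an
# input like "Find effusion.!@#$%^&*()Small left effusion.", the returned
# 'embed_mask' is 0 over the padding and instruction tokens and 1 over the
# report tokens, so pooling is restricted to the report while the instruction
# still conditions the bidirectional encoding.
#
#     batch = tokenize_with_separator(
#         ["Find effusion.!@#$%^&*()Small left effusion."], tokenizer
#     )
#     print(batch["input_ids"].shape, batch["embed_mask"].sum())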

def compute_similarities(model, tokenizer, texts, device):
    """
    Compute similarity scores between the first text and all other texts.

    Args:
        model: The LLM2Vec model.
        tokenizer: The tokenizer.
        texts (list): List of texts to compare (the first text is the reference).
        device: The device to run computations on.

    Returns:
        tuple: (embeddings, similarities)
    """
    with torch.no_grad():
        # Use separator-based tokenization if any text contains the separator
        if any('!@#$%^&*()' in text for text in texts):
            tokenized = tokenize_with_separator(texts, tokenizer, 512)
        else:
            tokenized = tokenizer(
                texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            )

        # Move inputs to the target device. Do NOT cast them to bfloat16:
        # input IDs and masks must stay integer tensors for the embedding
        # lookup, and the model's weights are already in bfloat16.
        tokenized = tokenized.to(device)

        embeddings = model(tokenized)

        # Cosine similarities between the first embedding and all others
        similarities = F.cosine_similarity(embeddings[0], embeddings[1:], dim=1)

    return embeddings, similarities
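
# Hedged usage sketch (compute_similarities is not called in main() below,
# which uses the wrapper's built-in encode_with_instruction instead; both
# follow the same tokenize-embed-compare pattern):
#
#     query = 'Determine the status of the effusion.!@#$%^&*()Stable effusion.'
#     _, sims = compute_similarities(
#         model, tokenizer, [query, 'Effusion is stable', 'No effusion'], device
#     )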

def main():
    """
    Example usage of the LLM2Vec4CXR model for chest X-ray report analysis.
    """
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load the model
    print("Loading LLM2Vec4CXR model...")
    model, tokenizer = load_llm2vec4cxr_model()
    model = model.to(device).to(torch.bfloat16)
    model.eval()

    # Example 1: Basic text embedding using the built-in method
    print("\n" + "=" * 60)
    print("Example 1: Basic Text Embedding (Built-in Method)")
    print("=" * 60)

    report = "There is a small increase in the left-sided effusion. There continues to be volume loss at both bases."

    # Use the convenient built-in method
    embedding = model.encode_text(report)
    print(f"Report: {report}")
    print(f"Embedding shape: {embedding.shape}")
    print(f"Embedding norm: {torch.norm(embedding).item():.4f}")

    # Example 2: Instruction-based similarity comparison
    print("\n" + "=" * 60)
    print("Example 2: Instruction-based Similarity Comparison")
    print("=" * 60)

    separator = '!@#$%^&*()'
    instruction = 'Determine the change or the status of the pleural effusion.'
    report = 'There is a small increase in the left-sided effusion. There continues to be volume loss at both bases.'
    text = instruction + separator + report

    comparison_options = [
        'No pleural effusion',
        'Pleural effusion',
        'Effusion is seen in the right',
        'Effusion is seen in the left',
        'Pleural effusion is improving',
        'Pleural effusion is stable',
        'Pleural effusion is worsening',
    ]
    all_texts = [text] + comparison_options

    # Use the built-in method for instruction-based encoding
    embeddings = model.encode_with_instruction(all_texts)
    similarities = F.cosine_similarity(embeddings[0], embeddings[1:], dim=1)

    print(f"Original text: {report}")
    print(f"Instruction: {instruction}")
    print("\nSimilarity Scores:")
    print("-" * 50)
    for option, score in zip(comparison_options, similarities):
        print(f"{option:<35} | {score.item():.4f}")

    # Find the most similar option
    best_match_idx = torch.argmax(similarities).item()
    print(f"\nBest match: {comparison_options[best_match_idx]} "
          f"(score: {similarities[best_match_idx].item():.4f})")

    # Example 3: Multiple report comparison
    print("\n" + "=" * 60)
    print("Example 3: Multiple Report Comparison")
    print("=" * 60)

    reports = [
        "No acute cardiopulmonary abnormality.",
        "Small bilateral pleural effusions.",
        "Large left pleural effusion with compressive atelectasis.",
        "Interval improvement in bilateral pleural effusions.",
        "Worsening bilateral pleural effusions.",
    ]

    print("Computing embeddings for multiple reports...")
    # Use the built-in method for multiple texts
    embeddings = model.encode_text(reports)

    # Compute pairwise cosine similarities via broadcasting:
    # (n, 1, d) against (1, n, d) yields an (n, n) matrix
    similarity_matrix = F.cosine_similarity(
        embeddings.unsqueeze(1),
        embeddings.unsqueeze(0),
        dim=2,
    )

    print("\nPairwise Similarity Matrix:")
    print("-" * 30)
    for i, report1 in enumerate(reports):
        print(f"Report {i+1}: {report1[:30]}...")
        for j in range(len(reports)):
            print(f"  vs Report {j+1}: {similarity_matrix[i][j].item():.4f}")
        print()

if __name__ == "__main__":
    main()