from open_clip import create_model_and_transforms
from template_tokenizer import template_tokenize
import torchvision.transforms as T
import torch
import torch.nn.functional as F
from utils import read_avi

# You'll need to log in to the HuggingFace hub CLI to download the models.
# You can do this with the terminal command "huggingface-cli login".
# You'll be asked to paste your HuggingFace API token, which you can find
# at https://huggingface.co/settings/token

# Use EchoCLIP-R for retrieval-based tasks where you want to find the
# similarity between two echoes, like in patient identification or echo
# report retrieval. It has a longer context window because it uses the
# template tokenizer, which we found increases its retrieval performance
# but decreases its performance on other zero-shot tasks.
echo_clip_r, _, preprocess_val = create_model_and_transforms(
    "hf-hub:mkaichristensen/echo-clip-r", precision="bf16"
)

# We'll load a sample echo video and preprocess its frames.
test_video = read_avi(
    "example_video.avi",
    (224, 224),
)
test_video = torch.stack(
    [preprocess_val(T.ToPILImage()(frame)) for frame in test_video], dim=0
)
test_video = test_video.cpu()
test_video = test_video.to(torch.bfloat16)

# Normalize the CLIP embeddings after computing them so that cosine
# similarity between embeddings reduces to a simple dot product.
test_video_embedding = F.normalize(echo_clip_r.encode_image(test_video), dim=-1)

# To get a single embedding for the entire video, we'll take the mean
# of the 10 frame embeddings.
test_video_embedding = test_video_embedding.mean(dim=0, keepdim=True)

# We'll now load an excerpt of the report associated with our echo
# and tokenize it using the template tokenizer.
with open("example_report.txt", "r") as f:
    test_report = f.read()
template_tokens = template_tokenize(test_report)
template_tokens = torch.tensor(template_tokens, dtype=torch.long).unsqueeze(0).cpu()
print(template_tokens)

# We can then embed the report using EchoCLIP-R.
test_report_embedding = F.normalize(echo_clip_r.encode_text(template_tokens), dim=-1)
print(test_report_embedding.shape)
print(test_video_embedding.shape)

# Since both embeddings are normalized, we can just take the dot product
# to get the cosine similarity between them.
similarity = (test_report_embedding @ test_video_embedding.T).squeeze(0)
print(similarity.item())
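
# Retrieval in practice means ranking many candidate reports against a
# single echo. Below is a minimal sketch of that: the candidate report
# strings are invented placeholders (not real reports), and it assumes
# template_tokenize pads every report to the same fixed context length
# so the token tensors can be stacked.
candidate_reports = [
    "Left ventricular ejection fraction is estimated at 60%.",
    "Severe aortic stenosis with a peak velocity of 4.5 m/s.",
    "Moderate mitral regurgitation with a dilated left atrium.",
]
candidate_tokens = torch.stack(
    [
        torch.tensor(template_tokenize(report), dtype=torch.long)
        for report in candidate_reports
    ],
    dim=0,
)
candidate_embeddings = F.normalize(echo_clip_r.encode_text(candidate_tokens), dim=-1)

# One dot product per candidate report; sorting from most to least
# similar ranks the candidates against the video.
candidate_similarities = (candidate_embeddings @ test_video_embedding.T).squeeze(-1)
for rank in candidate_similarities.argsort(descending=True).tolist():
    print(f"{candidate_similarities[rank].item():.3f}  {candidate_reports[rank]}")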
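
# The comments above also mention patient identification: the same
# embedding space can compare two echo videos directly. A short sketch,
# assuming a second clip "example_video_2.avi" (a hypothetical filename)
# run through the exact same pipeline as above:
second_video = read_avi("example_video_2.avi", (224, 224))
second_video = torch.stack(
    [preprocess_val(T.ToPILImage()(frame)) for frame in second_video], dim=0
).to(torch.bfloat16)
second_video_embedding = F.normalize(
    echo_clip_r.encode_image(second_video), dim=-1
).mean(dim=0, keepdim=True)

# Higher video-to-video similarity suggests the two clips are more
# likely to come from the same patient.
video_to_video_similarity = (test_video_embedding @ second_video_embedding.T).item()
print(video_to_video_similarity)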
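
# For reference, a rough sketch of what a read_avi helper might look
# like. The repo's own utils.read_avi is the authoritative version; this
# hypothetical stand-in just decodes frames with OpenCV (opencv-python)
# and resizes them to the requested size.
import cv2
import numpy as np

def read_avi_sketch(path, size):
    cap = cv2.VideoCapture(path)
    frames = []
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        # OpenCV decodes to BGR; convert to RGB before resizing.
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frames.append(cv2.resize(frame, size))
    cap.release()
    return np.stack(frames, axis=0)  # (num_frames, H, W, 3), uint8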