import glob
import json
import re
from pathlib import Path

import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
|
|
def extract_caption(text_block):
    """Return the caption text from the first line containing a caption marker.

    A marker is the text ``CAPTION:`` in any letter case (``Caption:``,
    ``caption:``, ...). The caption is whatever follows the last marker on
    that line, with surrounding whitespace stripped.

    Args:
        text_block: Multi-line string to scan.

    Returns:
        The stripped caption text, or ``""`` when no line contains a marker.
    """
    for line in text_block.splitlines():
        # Split case-insensitively so detection and extraction agree: a line
        # such as "Caption: a dog" must yield "a dog", not the whole line.
        pieces = re.split(r"caption:", line, flags=re.IGNORECASE)
        if len(pieces) > 1:
            return pieces[-1].strip()
    return ""
|
|
def load_captions_from_files(json_files):
    """Collect (image path, caption) pairs from a set of JSON files.

    Each file is parsed as JSON and iterated with ``.items()``. For every
    entry, the caption is extracted from ``value[0][0]``; entries whose value
    or first element is empty, or whose text yields no caption, are skipped.

    Args:
        json_files: Iterable of paths to JSON files.

    Returns:
        A ``(paths, captions)`` tuple of two parallel lists.
    """
    paths, captions = [], []

    for source in tqdm(json_files, desc="Reading files"):
        with open(source, 'r', encoding='utf-8') as handle:
            entries = json.load(handle)

        for image_key, nested in entries.items():
            # Only entries with a non-empty outer list and a non-empty first
            # element carry a text block to inspect.
            if nested and nested[0]:
                found = extract_caption(nested[0][0])
                if found:
                    paths.append(image_key)
                    captions.append(found)

    return paths, captions
|
|
def compute_and_save_embeddings(json_files, output_csv, model_name='all-MiniLM-L6-v2'):
    """Embed every caption found in *json_files* and write the result to a CSV.

    The CSV has an ``image_path`` column first, followed by the columns
    pandas creates from the encoder output (presumably one per embedding
    dimension -- confirm against the model's output shape).

    Args:
        json_files: Sized collection of JSON file paths; its length is
            reported in the summary message.
        output_csv: Destination path for the CSV file.
        model_name: Name passed to ``SentenceTransformer``. Defaults to the
            model this script has always used.

    Returns:
        None. Prints a message and returns early when no captions are found.
    """
    image_paths, captions = load_captions_from_files(json_files)

    if not captions:
        print("No valid captions found across input files.")
        return

    # Load the model only once there is something to encode, so an empty
    # input set does not pay for loading it.
    model = SentenceTransformer(model_name)
    embeddings = model.encode(captions, show_progress_bar=True)

    df = pd.DataFrame(embeddings)
    df.insert(0, "image_path", image_paths)
    df.to_csv(output_csv, index=False)
    print(f"Saved {len(df)} embeddings from {len(json_files)} files to {output_csv}")
|
|
| |
if __name__ == "__main__":
    # glob returns matches in arbitrary order; sort them so the rows of the
    # output CSV come out in the same order on every run.
    files = sorted(glob.glob("./MBD_text/*.json"))
    compute_and_save_embeddings(files, "combined_caption_embeddings.csv")
|
|