AZIIIIIIIIZ's picture
Upload 1039 files
d670799 verified
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os.path as osp
import pandas as pd
DATA_DIR = '../../../data/video_retrieval/msrvtt'
SUFFIX = '.mp4'
raw_data_path = osp.join(DATA_DIR, 'msrvtt_data/MSRVTT_data.json')
train_csv_path = [
osp.join(DATA_DIR, 'msrvtt_data/MSRVTT_train.9k.csv'),
osp.join(DATA_DIR, 'msrvtt_data/MSRVTT_train.7k.csv')
]
test_csv_path = osp.join(DATA_DIR, 'msrvtt_data/MSRVTT_JSFUSION_test.csv')
train_json_path = [
osp.join(DATA_DIR, 'train_9k.json'),
osp.join(DATA_DIR, 'train_7k.json')
]
test_json_path = osp.join(DATA_DIR, 'test_JSFUSION.json')
with open(raw_data_path, 'r') as f:
data = json.load(f)
sentences = data['sentences']
video_dict = {}
for sentence in sentences:
caption = sentence['caption']
video_id = sentence['video_id']
if video_id not in video_dict:
video_dict[video_id] = []
video_dict[video_id].append(caption)
for ip, op in zip(train_csv_path, train_json_path):
train_csv = pd.read_csv(ip)
train_video_ids = list(train_csv['video_id'].values)
train_video_dict = {}
for video_id in train_video_ids:
train_video_dict[video_id + SUFFIX] = video_dict[video_id]
with open(op, 'w') as f:
json.dump(train_video_dict, f)
test_data = pd.read_csv(test_csv_path)
test_video_dict = {}
for video_id, sentence in zip(test_data['video_id'], test_data['sentence']):
test_video_dict[video_id + SUFFIX] = [sentence]
with open(test_json_path, 'w') as f:
json.dump(test_video_dict, f)