"""Convert JSON-lines review and item-metadata dumps into three JSON files.

Given a review file (one JSON object per line with ``reviewerID``, ``asin``,
``unixReviewTime`` and optionally ``reviewText``) and a metadata file (one
JSON object per line with ``asin``, ``title``, ``description``, ``imageURL``
and ``imageURLHighRes``), this script writes into
``<save_dir>/<dataset>/``:

* ``<dataset>.inters.json``   - per-user item sequences ordered by time
* ``<dataset>.reviews.json``  - per-user mapping of item -> review text
* ``<dataset>.features.json`` - per-item title / description / image fields
"""
import argparse
import json
import os

# Users with fewer interactions than this are dropped from the sequences.
MIN_SEQUENCE_LENGTH = 5

# Metadata fields that must all be present and non-empty for an item to be kept.
REQUIRED_FEATURE_FIELDS = ('title', 'description', 'imageURL', 'imageURLHighRes')


def parse_args(argv=None):
    """Parse command-line options.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is the original behavior.

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--review_data_path', type=str, default='./data')
    parser.add_argument('--meta_data_path', type=str, default='./data')
    parser.add_argument('--save_dir', type=str, default='./data/dataset')
    parser.add_argument('--dataset', type=str, default='dataset')
    return parser.parse_args(argv)


def iter_records(lines):
    """Yield one decoded JSON object per non-blank line of ``lines``."""
    for line in lines:
        # A trailing or stray blank line would make json.loads raise.
        if line.strip():
            yield json.loads(line)


def extract_interactions(lines, min_length=MIN_SEQUENCE_LENGTH):
    """Build time-ordered item sequences per user.

    Args:
        lines: Iterable of JSON strings, each with ``reviewerID``, ``asin``
            and ``unixReviewTime``.
        min_length: Minimum number of interactions a user needs to be kept.

    Returns:
        Dict mapping reviewer id to the list of item ids sorted by ascending
        ``unixReviewTime``. Users appear in order of first occurrence.
    """
    events_by_user = {}
    for record in iter_records(lines):
        events_by_user.setdefault(record['reviewerID'], []).append(
            (record['unixReviewTime'], record['asin'])
        )

    sequences = {}
    for user, events in events_by_user.items():
        if len(events) < min_length:
            continue
        # Sort on the timestamp only: list.sort is stable, so events sharing a
        # timestamp keep their order of appearance in the input.
        events.sort(key=lambda event: event[0])
        sequences[user] = [item for _, item in events]
    return sequences


def extract_reviews(lines):
    """Collect the non-empty review texts written by each user.

    Args:
        lines: Iterable of JSON strings, each with ``reviewerID``, ``asin``
            and optionally ``reviewText``.

    Returns:
        Dict mapping reviewer id to a dict of ``{item id: review text}``.
        Records whose ``reviewText`` is missing or empty are skipped. If a
        user has several reviews for the same item, the last one wins.
    """
    reviews = {}
    for record in iter_records(lines):
        text = record.get('reviewText')
        if not text:
            continue
        # Accumulate into the user's dict; assigning a fresh dict here would
        # discard every earlier review by the same user.
        reviews.setdefault(record['reviewerID'], {})[record['asin']] = text
    return reviews


def extract_features(lines):
    """Collect title, description and image fields for fully described items.

    Args:
        lines: Iterable of JSON strings, each with ``asin`` and the fields
            listed in ``REQUIRED_FEATURE_FIELDS``.

    Returns:
        Dict mapping item id to a dict with keys ``title``, ``description``,
        ``image`` and ``imageH``. Items with any required field missing or
        empty are skipped.
    """
    features = {}
    for record in iter_records(lines):
        if not all(record.get(field) for field in REQUIRED_FEATURE_FIELDS):
            continue
        features[record['asin']] = {
            'title': record['title'],
            'description': record['description'],
            'image': record['imageURL'],
            'imageH': record['imageURLHighRes'],
        }
    return features


def write_json(obj, path):
    """Serialize ``obj`` to ``path`` as indented UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(obj, f, ensure_ascii=False, indent=4)


def main(argv=None):
    """Run the three extraction steps and write their outputs to disk."""
    args = parse_args(argv)

    out_dir = os.path.join(args.save_dir, args.dataset)
    # makedirs also creates save_dir itself when it does not exist yet and
    # does not fail when the directory is already there.
    os.makedirs(out_dir, exist_ok=True)
    out_prefix = os.path.join(out_dir, args.dataset)

    # Extract interaction sequences.
    with open(args.review_data_path, 'r', encoding='utf-8') as file:
        write_json(extract_interactions(file), out_prefix + '.inters.json')

    # Extract user reviews.
    with open(args.review_data_path, 'r', encoding='utf-8') as file:
        write_json(extract_reviews(file), out_prefix + '.reviews.json')

    # Extract item features.
    with open(args.meta_data_path, 'r', encoding='utf-8') as file:
        write_json(extract_features(file), out_prefix + '.features.json')


if __name__ == '__main__':
    main()