# File size: 3,307 Bytes
# 8a506a6
import json
import argparse
def parse_args(argv=None):
    """Parse the command-line options of this preprocessing script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) the process command line is parsed, exactly as
            before, so existing ``parse_args()`` callers are unaffected.

    Returns:
        argparse.Namespace with the string attributes ``review_data_path``,
        ``meta_data_path``, ``save_dir`` and ``dataset``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--review_data_path', type=str, default='./data',
        help='review file to read, one JSON object per line')
    parser.add_argument(
        '--meta_data_path', type=str, default='./data',
        help='item metadata file to read, one JSON object per line')
    parser.add_argument(
        '--save_dir', type=str, default='./data/dataset',
        help='directory under which the <dataset>/ output folder is created')
    parser.add_argument(
        '--dataset', type=str, default='dataset',
        help='dataset name; used as output folder name and output file prefix')
    # Passing None makes argparse fall back to sys.argv[1:].
    return parser.parse_args(argv)
# Parse the command line once; the namespace is read by every step below.
args = parse_args()
import os

# Create <save_dir>/<dataset>, including any missing parent directories.
# exist_ok=True keeps reruns harmless and avoids the check-then-create race
# that a separate os.path.exists() test followed by os.mkdir() has.
os.makedirs(args.save_dir + '/' + args.dataset, exist_ok=True)
# ---- Extract interaction sequences ----
# Input is the review file given by --review_data_path (one JSON object per
# line, e.g. Musical_Instruments.json).
# Step 1: group the (time, item) pairs of all reviews by reviewer.
inters = {}
with open(args.review_data_path, 'r', encoding='utf-8') as review_file:
    for raw_line in review_file:
        record = json.loads(raw_line)
        user = record['reviewerID']
        event = {'time': record['unixReviewTime'], 'item': record['asin']}
        inters.setdefault(user, []).append(event)

# Step 2: drop users with fewer than 5 interactions.
filtered_inters = {
    user: events for user, events in inters.items() if len(events) >= 5
}

# Step 3: order each remaining user's events by review time (in place, stable
# for equal timestamps) and keep only the item ids.
final_inters = {}
for user, events in filtered_inters.items():
    events.sort(key=lambda event: event['time'])
    final_inters[user] = [event['item'] for event in events]

# Step 4: write {reviewerID: [asin, ...]} to <save_dir>/<dataset>/<dataset>.inters.json
inters_path = args.save_dir + '/' + args.dataset + '/' + args.dataset + '.inters.json'
with open(inters_path, 'w', encoding='utf-8') as out_file:
    json.dump(final_inters, out_file, ensure_ascii=False, indent=4)
# ---- Extract user reviews ----
# Builds {reviewerID: {asin: reviewText}} from the review file given by
# --review_data_path (one JSON object per line).
reviews = {}
with open(args.review_data_path, 'r', encoding='utf-8') as file:
    for line in file:
        element = json.loads(line)
        review_text = element.get('reviewText')
        # Skip records whose review text is missing, null or empty
        # (len() on a JSON null would raise TypeError).
        if not review_text:
            continue
        # Accumulate into the user's mapping. Assigning a fresh one-entry dict
        # here would discard every earlier review by the same user and keep
        # only the last one read. A repeated (user, item) pair keeps the
        # latest text.
        reviews.setdefault(element['reviewerID'], {})[element['asin']] = review_text

# Write the result to <save_dir>/<dataset>/<dataset>.reviews.json
with open(args.save_dir + '/' + args.dataset + '/' + args.dataset + '.reviews.json', 'w', encoding='utf-8') as f:
    json.dump(reviews, f, ensure_ascii=False, indent=4)
# ---- Extract item features ----
# Builds {asin: {title, description, image, imageH}} from the metadata file
# given by --meta_data_path (one JSON object per line).
features = {}
required_fields = ('title', 'description', 'imageURL', 'imageURLHighRes')
with open(args.meta_data_path, 'r', encoding='utf-8') as file:
    for line in file:
        element = json.loads(line)
        # Keep an item only when every required field is present and
        # non-empty. The truthiness test also skips JSON null values, which
        # would make len() raise TypeError.
        if not all(element.get(field) for field in required_fields):
            continue
        features[element['asin']] = {
            'title': element['title'],
            'description': element['description'],
            'image': element['imageURL'],
            'imageH': element['imageURLHighRes'],
        }

# Write the result to <save_dir>/<dataset>/<dataset>.features.json
with open(args.save_dir + '/' + args.dataset + '/' + args.dataset + '.features.json', 'w', encoding='utf-8') as f:
    json.dump(features, f, ensure_ascii=False, indent=4)