|
import json |
|
import re |
|
|
|
# Input file in JSON Lines format: one JSON document per line.
json_path = "/mnt/bn/vl-research/workspace/boli01/projects/sft_data_workspace/vlfeedback_80k.jsonl"

# Decode explicitly as UTF-8 so non-ASCII text in the records (for example the
# "【…†source】" markers stripped by convert_format) does not depend on the
# platform's default encoding.
with open(json_path, "r", encoding="utf-8") as f:
    # Stream the file line by line instead of materialising every raw line
    # first, and skip blank lines (such as a trailing empty line), which
    # json.loads would reject.
    data = [json.loads(line) for line in f if line.strip()]
|
|
|
|
|
# Matches inline citation markers such as "【12†source】". Compiled once at
# import time instead of being rebuilt for every record.
_SOURCE_CITATION = re.compile(r"【\d+†source】")


def _strip_source_citations(text):
    """Return *text* with every "【N†source】" marker removed.

    Text without the "†source" substring is returned unchanged. When the
    substring is present, the text is printed before and after cleaning
    (separated by a line of asterisks) so the change can be inspected.
    """
    if "†source" not in text:
        return text

    print(text)
    cleaned = _SOURCE_CITATION.sub("", text)
    print("*****************************************")
    print(cleaned)
    return cleaned


def _helpfulness_rating(completion):
    """Return the integer Helpfulness rating of a single completion."""
    return int(completion["annotations"]["Helpfulness"]["Rating"])


def convert_format(original_data, dimension: str = "Visual Faithfulness") -> list:
    """Convert annotated records into chosen/rejected preference pairs.

    For each record, the completion with the highest Helpfulness rating
    becomes ``chosen`` and the one with the lowest becomes ``rejected``
    (ties resolve to the first such completion). Citation markers of the form
    "【N†source】" are stripped from both responses.

    Args:
        original_data: Iterable of dicts, each with ``id``, ``prompt`` and a
            non-empty ``completions`` list. Every completion needs a
            ``response`` string and ``annotations[<name>]["Rating"]`` values
            convertible with ``int`` for "Helpfulness" and for *dimension*.
        dimension: Annotation name whose ratings are reported as
            ``chosen_score`` and ``rejected_score``.

    Returns:
        A list with one dict per input record, holding the keys ``id``,
        ``prompt``, ``answer`` (always ""), ``image``, ``chosen``,
        ``rejected``, ``chosen_score`` and ``rejected_score``.

    Raises:
        ValueError: If a record's ``completions`` list is empty, or a rating
            cannot be converted to ``int``.
        KeyError: If a required key is missing from a record.
    """
    converted_data = []

    for item in original_data:
        completions = item["completions"]

        # NOTE(review): the pair is always ranked by "Helpfulness", while the
        # scores stored below come from `dimension`. The chosen score can
        # therefore be lower than the rejected score for other dimensions.
        # Confirm this is intended.
        best_completion = max(completions, key=_helpfulness_rating)
        worst_completion = min(completions, key=_helpfulness_rating)

        # Clean the chosen response first, then the rejected one, so the
        # diagnostic output keeps that order.
        best_response = _strip_source_citations(best_completion["response"])
        worst_response = _strip_source_citations(worst_completion["response"])

        converted_data.append(
            {
                "id": item["id"],
                "prompt": item["prompt"],
                "answer": "",
                "image": f"silkie_dpo/{item['id']}.jpg",
                "chosen": best_response,
                "rejected": worst_response,
                "chosen_score": int(best_completion["annotations"][dimension]["Rating"]),
                "rejected_score": int(worst_completion["annotations"][dimension]["Rating"]),
            }
        )

    return converted_data
|
|
|
|
|
# Write one preference-pair JSON file per annotation dimension. The file name
# embeds the lower-cased, underscore-separated dimension name and the number
# of converted records.
for dimension in ("Visual Faithfulness", "Helpfulness", "Ethical Considerations"):
    converted_data = convert_format(data, dimension=dimension)

    dimension_slug = dimension.replace(' ', '_').lower()
    record_count = len(converted_data)
    output_path = f"/mnt/bn/vl-research/data/llava_instruct/dpo_data/silkie_dpo_data_{dimension_slug}_{record_count}.json"

    with open(output_path, "w") as f:
        json.dump(converted_data, f, indent=4)
|
|