llava-uhd-new / scripts /archived /dpo_data_info.py

Add files using upload-large-folder tool

ca7c7f9 verified about 2 months ago

2.85 kB

	import json
	import re

	json_path = "/mnt/bn/vl-research/workspace/boli01/projects/sft_data_workspace/vlfeedback_80k.jsonl"

	# Load the JSON Lines file: one JSON object per line.
	# - The encoding is pinned to UTF-8 because the responses contain non-ASCII
	#   characters (e.g. the "【n†source】" markers stripped later); relying on the
	#   locale default can fail or mis-decode on non-UTF-8 systems.
	# - Lines are streamed from the file handle instead of being materialised
	#   with readlines() first, and blank lines (such as a trailing newline at
	#   the end of the file) are skipped rather than crashing json.loads().
	with open(json_path, "r", encoding="utf-8") as f:
	    data = [json.loads(line) for line in f if line.strip()]


	# Matches citation markers of the form 【<digits>†source】.
	_SOURCE_MARKER_RE = re.compile(r"【\d+†source】")


	def _strip_source_markers(text, verbose=True):
	    """Return *text* with every 【<digits>†source】 marker removed.

	    Text that does not contain "†source" is returned unchanged. When a
	    response does contain it and *verbose* is true, the text before and
	    after cleaning is printed, separated by a line of asterisks.
	    """
	    if "†source" not in text:
	        return text
	    cleaned = _SOURCE_MARKER_RE.sub("", text)
	    if verbose:
	        print(text)
	        print("*****************************************")
	        print(cleaned)
	    return cleaned


	def _helpfulness_rating(completion):
	    """Return the integer Helpfulness rating of one completion."""
	    return int(completion["annotations"]["Helpfulness"]["Rating"])


	def convert_format(original_data, dimension="Visual Faithfulness", *, verbose=True):
	    """Convert annotated completion records into chosen/rejected pairs.

	    For every item, the completion with the highest Helpfulness rating
	    becomes ``chosen`` and the one with the lowest Helpfulness rating becomes
	    ``rejected`` (the first such completion wins on ties). Citation markers
	    of the form 【<digits>†source】 are removed from both responses.

	    NOTE: the pair is always selected by Helpfulness; *dimension* only
	    decides which annotation's rating is reported in ``chosen_score`` and
	    ``rejected_score``. For other dimensions the chosen score is therefore
	    not guaranteed to be the higher one.

	    Args:
	        original_data: Iterable of dicts with the keys ``id``, ``prompt`` and
	            ``completions``; each completion needs ``response`` and
	            ``annotations[<name>]["Rating"]`` for "Helpfulness" and for
	            *dimension*.
	        dimension: Annotation name whose ratings are copied into the
	            ``*_score`` fields.
	        verbose: When true (the default), print each response that contained
	            a "†source" marker before and after cleaning.

	    Returns:
	        A list with one dict per input item, holding the keys ``id``,
	        ``prompt``, ``answer`` (always ""), ``image``, ``chosen``,
	        ``rejected``, ``chosen_score`` and ``rejected_score``.

	    Raises:
	        ValueError: If an item has an empty ``completions`` list, or a rating
	            cannot be converted to ``int``.
	        KeyError: If a required key is missing.
	    """
	    converted_data = []
	    for item in original_data:
	        completions = item["completions"]
	        best_completion = max(completions, key=_helpfulness_rating)
	        worst_completion = min(completions, key=_helpfulness_rating)

	        new_item = {
	            "id": item["id"],
	            "prompt": item["prompt"],
	            "answer": "",
	            # The image path is derived directly from the item id.
	            "image": f"silkie_dpo/{item['id']}.jpg",
	            "chosen": _strip_source_markers(best_completion["response"], verbose),
	            "rejected": _strip_source_markers(worst_completion["response"], verbose),
	            "chosen_score": int(best_completion["annotations"][dimension]["Rating"]),
	            "rejected_score": int(worst_completion["annotations"][dimension]["Rating"]),
	        }
	        converted_data.append(new_item)

	    return converted_data


	# Write one DPO JSON file per annotation dimension. The file name carries
	# the lower-cased, underscore-separated dimension name and the record count.
	for dimension in ("Visual Faithfulness", "Helpfulness", "Ethical Considerations"):
	    converted_data = convert_format(data, dimension=dimension)
	    slug = dimension.replace(' ', '_').lower()
	    count = len(converted_data)
	    out_path = f"/mnt/bn/vl-research/data/llava_instruct/dpo_data/silkie_dpo_data_{slug}_{count}.json"
	    with open(out_path, "w") as out_file:
	        json.dump(converted_data, out_file, indent=4)