Spaces:

Ciallo0d00
/

GeoLLM

Runtime error

App Files Files Community

GeoLLM / eval.py

Ciallo0d00

Upload folder using huggingface_hub

badcf3c verified 5 months ago

raw

history blame contribute delete

17.5 kB

	import json
	import numpy as np
	from metrics.graph_matching import (
	get_triple_match_f1,
	get_graph_match_accuracy,
	get_bert_score,
	get_bleu_rouge,
	split_to_edges,
	get_tokens,
	get_ged
	)

	def load_data(gold_path, pred_path):
	    """Load gold and predicted triples and align them by source text.

	    Both files are JSON arrays of objects that carry a 'text' field and a
	    'triple_list' field. Only texts that occur in the prediction file are
	    evaluated: each prediction is paired with the first gold item that has
	    exactly the same 'text'. Predictions without a matching gold text are
	    skipped silently.

	    The two returned lists are parallel and are meant to be passed to
	    evaluate_triples.

	    Args:
	        gold_path: Path of the UTF-8 JSON file with the reference triples.
	        pred_path: Path of the UTF-8 JSON file with the predicted triples.

	    Returns:
	        A tuple (gold_graphs, pred_graphs) of equal-length lists, ordered
	        like the prediction file; element i of each list is the
	        'triple_list' of the i-th matched text.

	    Raises:
	        OSError: If either file cannot be opened.
	        json.JSONDecodeError: If either file is not valid JSON.
	        KeyError: If an item lacks the 'text' field, or a matched item
	            lacks the 'triple_list' field.
	    """
	    # Load the reference data.
	    with open(gold_path, 'r', encoding='utf-8') as f:
	        gold_data = json.load(f)

	    # Load the predictions.
	    with open(pred_path, 'r', encoding='utf-8') as f:
	        pred_data = json.load(f)

	    # Index gold items by text once instead of rescanning the gold list for
	    # every prediction. setdefault keeps the FIRST item for a duplicated
	    # text, which is the item a front-to-back scan would have found.
	    gold_by_text = {}
	    for gold_item in gold_data:
	        gold_by_text.setdefault(gold_item['text'], gold_item)

	    gold_graphs = []
	    pred_graphs = []
	    for pred_item in pred_data:
	        gold_item = gold_by_text.get(pred_item['text'])
	        if gold_item is None:
	            # No gold entry for this text: leave it out of the evaluation.
	            continue
	        gold_graphs.append(gold_item['triple_list'])
	        pred_graphs.append(pred_item['triple_list'])

	    return gold_graphs, pred_graphs

	def evaluate_triples(gold_graphs, pred_graphs):
	    """Score predicted triples against the gold triples and print a report.

	    Two metric groups are computed and printed:
	      * Triple Match: precision, recall and F1 from get_triple_match_f1.
	      * BERT Score: the mean of the precision, recall and F1 values that
	        get_bert_score returns for the edges produced by split_to_edges.

	    Graph-match accuracy, BLEU/ROUGE and graph edit distance (GED) are
	    currently disabled; their code is kept below as comments.

	    Args:
	        gold_graphs: Reference graphs; presumably one triple list per text,
	            as returned by load_data (verify against the metric helpers).
	        pred_graphs: Predicted graphs, parallel to gold_graphs.

	    Returns:
	        A dict with the keys 'triple_match' and 'bert_score', each mapping
	        'precision', 'recall' and 'f1' to the corresponding value.
	    """
	    print("开始评估...")
	    print("=" * 50)

	    # 1. Triple Match F1
	    tm_precision, tm_recall, tm_f1 = get_triple_match_f1(gold_graphs, pred_graphs)
	    triple_match = {
	        'precision': tm_precision,
	        'recall': tm_recall,
	        'f1': tm_f1,
	    }
	    print("Triple Match")
	    print(
	        f"精确率: {triple_match['precision']:.4f}, "
	        f"召回率: {triple_match['recall']:.4f}, "
	        f"F1: {triple_match['f1']:.4f}"
	    )

	    # # 2. Graph Match Accuracy
	    # graph_acc = get_graph_match_accuracy(pred_graphs, gold_graphs)
	    # print(f"图匹配准确率: {graph_acc:.10f}")

	    # 3. BERT Score
	    gold_edges = split_to_edges(gold_graphs)
	    pred_edges = split_to_edges(pred_graphs)
	    bs_precisions, bs_recalls, bs_f1s = get_bert_score(gold_edges, pred_edges)
	    # Reduce each per-item score collection to its mean once; the same
	    # numbers are printed and returned.
	    bert_score = {
	        'precision': bs_precisions.mean(),
	        'recall': bs_recalls.mean(),
	        'f1': bs_f1s.mean(),
	    }
	    print("BERT Score:")
	    for label, key in (("Precision", 'precision'), ("Recall", 'recall'), ("F1", 'f1')):
	        print(f"- {label}: {bert_score[key]:.4f}")

	    # # 4. BLEU & ROUGE
	    # gold_tokens, pred_tokens = get_tokens(gold_edges, pred_edges)
	    # p_rouge, r_rouge, f1_rouge, p_bleu, r_bleu, f1_bleu = get_bleu_rouge(
	    #     gold_tokens, pred_tokens, gold_edges, pred_edges
	    # )
	    # print(f"\nBLEU分数:")
	    # print(f"- Precision: {p_bleu.mean():.4f}")
	    # print(f"- Recall: {r_bleu.mean():.4f}")
	    # print(f"- F1: {f1_bleu.mean():.4f}")

	    # print(f"\nROUGE分数:")
	    # print(f"- Precision: {p_rouge.mean():.4f}")
	    # print(f"- Recall: {r_rouge.mean():.4f}")
	    # print(f"- F1: {f1_rouge.mean():.4f}")

	    # # 5. Graph edit distance (GED)
	    # total_ged = 0
	    # for gold, pred in zip(gold_graphs, pred_graphs):
	    #     ged = get_ged(gold, pred)
	    #     total_ged += ged
	    # avg_ged = total_ged / len(gold_graphs)
	    # print(f"\n平均图编辑距离: {avg_ged:.4f}")

	    # Collect every enabled metric. Disabled entries would be:
	    # 'graph_acc': graph_acc,
	    # 'bleu': {'precision': p_bleu.mean(), 'recall': r_bleu.mean(), 'f1': f1_bleu.mean()},
	    # 'rouge': {'precision': p_rouge.mean(), 'recall': r_rouge.mean(), 'f1': f1_rouge.mean()},
	    # 'ged': avg_ged
	    return {
	        'triple_match': triple_match,
	        'bert_score': bert_score,
	    }

	if __name__ == '__main__':
	    # Batch evaluation: score every prediction file listed in model_paths
	    # against the gold file, print the metrics, and append them to a shared
	    # text report.
	    #
	    # Single-run usage, kept for reference:
	    # gold_path = './data/train_triples.json'
	    # pred_path = './output/gpt.json'
	    # gold_graphs, pred_graphs = load_data(gold_path, pred_path)
	    # results = evaluate_triples(gold_graphs, pred_graphs)

	    # File paths
	    gold_path = './data/GT_500.json'
	    model_paths = [
	        # # gpt-3.5
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/zero_shot/old/gpt-3.5-turbo.json', # zero-shot
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/one_shot/gpt-3p5-turbo.json', # one-shot
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/two_shot/gpt-3p5-turbo.json', # two-shot
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/three_shot/gpt-3p5-turbo.json', # three-shot
	        # 'F:/GeoLLM/output/output_result/Task1/knn/one_shot/gpt-3p5-turbo.json', # KNN one-shot
	        # 'F:/GeoLLM/output/output_result/Task1/knn/two_shot/gpt-3p5-turbo.json', # KNN two-shot
	        # 'F:/GeoLLM/output/output_result/Task1/knn/three_shot/gpt-3p5-turbo.json', # KNN three-shot
	        # 'F:/GeoLLM/output/output_result/Task1/Knowledge-guided/one_shot/gpt-3p5-turbo.json', # knowledge-guided one-shot
	        # # gpt-4o
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/zero_shot/old/gpt-4o.json', # zero-shot
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/one_shot/gpt-4o.json', # one-shot
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/two_shot/gpt-4o.json', # two-shot
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/three_shot/gpt-4o.json', # three-shot
	        # 'F:/GeoLLM/output/output_result/Task1/knn/one_shot/gpt-4o.json', # KNN one-shot
	        # 'F:/GeoLLM/output/output_result/Task1/knn/two_shot/gpt-4o.json', # KNN two-shot
	        # 'F:/GeoLLM/output/output_result/Task1/knn/three_shot/gpt-4o.json', # KNN three-shot
	        # 'F:/GeoLLM/output/output_result/Task1/Knowledge-guided/one_shot/gpt-4o.json', # knowledge-guided one-shot
	        # # gemini-1p5-pro-002
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/zero_shot/gemini-1p5-pro-002.json', # zero-shot
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/one_shot/gemini-1p5-pro-002.json', # one-shot
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/two_shot/gemini-1p5-pro-002.json', # two-shot
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/three_shot/gemini-1p5-pro-002.json', # three-shot
	        # 'F:/GeoLLM/output/output_result/Task1/knn/one_shot/gemini-1p5-pro-002.json', # KNN one-shot
	        # 'F:/GeoLLM/output/output_result/Task1/knn/two_shot/gemini-1p5-pro-002.json', # KNN two-shot
	        # 'F:/GeoLLM/output/output_result/Task1/knn/three_shot/gemini-1p5-pro-002.json', # KNN three-shot
	        # 'F:/GeoLLM/output/output_result/Task1/Knowledge-guided/one_shot/gemini-1p5-pro-002.json', # knowledge-guided one-shot
	        # # claude-3-5-haiku-20241022
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/zero_shot/claude-3-5-haiku-20241022.json', # zero-shot
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/one_shot/claude-3-5-haiku-20241022.json', # one-shot
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/two_shot/claude-3-5-haiku-20241022.json', # two-shot
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/three_shot/claude-3-5-haiku-20241022.json', # three-shot
	        # 'F:/GeoLLM/output/output_result/Task1/knn/one_shot/claude-3-5-haiku-20241022.json', # KNN one-shot
	        # 'F:/GeoLLM/output/output_result/Task1/knn/two_shot/claude-3-5-haiku-20241022.json', # KNN two-shot
	        # 'F:/GeoLLM/output/output_result/Task1/knn/three_shot/claude-3-5-haiku-20241022.json', # KNN three-shot
	        # 'F:/GeoLLM/output/output_result/Task1/Knowledge-guided/one_shot/claude-3-5-haiku-20241022.json', # knowledge-guided one-shot
	        # # deepseek-ai
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/zero_shot/deepseek-ai/DeepSeek-V3.json', # zero-shot
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/one_shot/deepseek-ai/DeepSeek-V3.json', # one-shot
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/two_shot/deepseek-ai/DeepSeek-V3.json', # two-shot
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/three_shot/deepseek-ai/DeepSeek-V3.json', # three-shot
	        # 'F:/GeoLLM/output/output_result/Task1/knn/one_shot/deepseek-ai/DeepSeek-V3.json', # KNN one-shot
	        # 'F:/GeoLLM/output/output_result/Task1/knn/two_shot/deepseek-ai/DeepSeek-V3.json', # KNN two-shot
	        # 'F:/GeoLLM/output/output_result/Task1/knn/three_shot/deepseek-ai/DeepSeek-V3.json', # KNN three-shot
	        # 'F:/GeoLLM/output/output_result/Task1/Knowledge-guided/one_shot/deepseek-ai/DeepSeek-V3.json', # knowledge-guided one-shot

	        # # R1
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/zero_shot/deepseek-ai/DeepSeek-R1.json', # zero-shot
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/one_shot/deepseek-ai/DeepSeek-R1.json', # one-shot
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/two_shot/deepseek-ai/DeepSeek-R1.json', # two-shot
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/three_shot/deepseek-ai/DeepSeek-R1.json', # three-shot
	        # 'F:/GeoLLM/output/output_result/Task1/knn/one_shot/deepseek-ai/DeepSeek-R1.json', # KNN one-shot
	        # 'F:/GeoLLM/output/output_result/Task1/knn/two_shot/deepseek-ai/DeepSeek-R1.json', # KNN two-shot
	        # 'F:/GeoLLM/output/output_result/Task1/knn/three_shot/deepseek-ai/DeepSeek-R1.json', # KNN three-shot
	        # 'F:/GeoLLM/output/output_result/Task1/Knowledge-guided/one_shot/deepseek-ai/DeepSeek-R1.json', # knowledge-guided one-shot
	        # # meta-llama
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/zero_shot/meta-llama/Meta-Llama-3p1-405B-Instruct.json', # zero-shot
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/one_shot/meta-llama/Meta-Llama-3p1-405B-Instruct.json', # one-shot
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/two_shot/meta-llama/Meta-Llama-3p1-405B-Instruct.json', # two-shot
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/three_shot/meta-llama/Meta-Llama-3p1-405B-Instruct.json', # three-shot
	        # 'F:/GeoLLM/output/output_result/Task1/knn/one_shot/meta-llama/Meta-Llama-3p1-405B-Instruct.json', # KNN one-shot
	        # 'F:/GeoLLM/output/output_result/Task1/knn/two_shot/meta-llama/Meta-Llama-3p1-405B-Instruct.json', # KNN two-shot
	        # 'F:/GeoLLM/output/output_result/Task1/knn/three_shot/meta-llama/Meta-Llama-3p1-405B-Instruct.json', # KNN three-shot
	        # 'F:/GeoLLM/output/output_result/Task1/Knowledge-guided/one_shot/meta-llama/Meta-Llama-3p1-405B-Instruct.json', # knowledge-guided one-shot
	        # # Qwen
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/zero_shot/Qwen/Qwen2p5-72B-Instruct.json', # zero-shot
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/one_shot/Qwen/Qwen2p5-72B-Instruct.json', # one-shot
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/two_shot/Qwen/Qwen2p5-72B-Instruct.json', # two-shot
	        # 'F:/GeoLLM/output/output_result/Task1/nomal/three_shot/Qwen/Qwen2p5-72B-Instruct.json', # three-shot
	        # 'F:/GeoLLM/output/output_result/Task1/knn/one_shot/Qwen/Qwen2p5-72B-Instruct.json', # KNN one-shot
	        # 'F:/GeoLLM/output/output_result/Task1/knn/two_shot/Qwen/Qwen2p5-72B-Instruct.json', # KNN two-shot
	        # 'F:/GeoLLM/output/output_result/Task1/knn/three_shot/Qwen/Qwen2p5-72B-Instruct.json', # KNN three-shot
	        # 'F:/GeoLLM/output/output_result/Task1/Knowledge-guided/one_shot/Qwen/Qwen2p5-72B-Instruct.json', # knowledge-guided one-shot

	        # 'F:/GeoLLM/output/Knowledge-guided_rerun/one_shot/gpt-3.5-turbo_one_shot.json', # knowledge-guided one-shot
	        # 'F:/GeoLLM/output/Knowledge-guided_rerun/one_shot/gpt-3.5-turbo_0407.json', # knowledge-guided one-shot
	        # 'F:/GeoLLM/output/Knowledge-guided_rerun/one_shot/gpt-3.5-turbo_old.json', # knowledge-guided one-shot
	        # 'F:/GeoLLM/output/Knowledge-guided_rerun/one_shot/gpt-4o_konwledge_tri.json', # knowledge-guided one-shot
	        # 'F:/GeoLLM/output/Knowledge-guided_rerun/one_shot/gemini-1.5-pro-002_one_shot.json', # knowledge-guided one-shot
	        # 'F:/GeoLLM/output/Knowledge-guided_rerun/one_shot/claude-3-5-haiku-20241022_one_shot.json', # knowledge-guided one-shot
	        # 'F:/GeoLLM/output/Knowledge-guided_rerun/one_shot/deepseek-ai/DeepSeek-V3_0420.json', # knowledge-guided one-shot
	        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_only_tri/deepseek-ai/DeepSeek-V3.json', # knowledge-guided one-shot
	        'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/deepseek-ai/DeepSeek-R1_guide.json',  # knowledge-guided one-shot
	        'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/deepseek-ai/DeepSeek-R1_one_shot.json',  # knowledge-guided one-shot
	        'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_only_tri/deepseek-ai/DeepSeek-R1.json',
	        # 'F:/GeoLLM/output/Knowledge-guided_rerun/one_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_one_shot.json', # knowledge-guided one-shot
	        # 'F:/GeoLLM/output/Knowledge-guided_rerun/one_shot/Qwen/Qwen2.5-72B-Instruct_one_shot.json', # knowledge-guided one-shot

	        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/gpt-3.5-turbo_one_shot.json', # knowledge-guided one-shot
	        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/gpt-4o_one_shot.json', # knowledge-guided one-shot
	        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/gemini-1.5-pro-002_one_shot.json', # knowledge-guided one-shot
	        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/claude-3-5-haiku-20241022_one_shot.json', # knowledge-guided one-shot
	        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/deepseek-ai/DeepSeek-V3_one_shot.json', # knowledge-guided one-shot
	        # # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/deepseek-ai/DeepSeek-R1.json', # knowledge-guided one-shot
	        # # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/deepseek-ai/DeepSeek-R1_one_shot.json'
	        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/meta-llama/Meta-Llama-3.1-405B-Instruct_one_shot.json', # knowledge-guided one-shot
	        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/Qwen/Qwen2.5-72B-Instruct_one_shot.json', # knowledge-guided one-shot

	        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/gpt-3.5-turbo.json', # knowledge-guided one-shot
	        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/gpt-4o.json', # knowledge-guided one-shot
	        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/gemini-1.5-pro-002.json', # knowledge-guided one-shot
	        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/claude-3-5-haiku-20241022.json', # knowledge-guided one-shot
	        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/deepseek-ai/DeepSeek-V3.json', # knowledge-guided one-shot
	        # # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/deepseek-ai/DeepSeek-R1.json', # knowledge-guided one-shot
	        # # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/deepseek-ai/DeepSeek-R1_one_shot.json'
	        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/meta-llama/Meta-Llama-3.1-405B-Instruct.json', # knowledge-guided one-shot
	        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/Qwen/Qwen2.5-72B-Instruct.json', # knowledge-guided one-shot
	    ]
	    # Report file shared by all runs; results are appended, never overwritten.
	    save_path = 'F:/GeoLLM/output/output_result/Task1/Result_Task1.txt'

	    # Compare the models.
	    print("各模型评估结果:")
	    # Results of every evaluated model, keyed by file stem. Two files with
	    # the same stem in different directories overwrite each other here; the
	    # report file is keyed by full path and is not affected.
	    all_results = {}

	    for pred_path in model_paths:
	        # File name without its final extension. Splitting only on the LAST
	        # '.' keeps dotted model names such as 'gpt-3.5-turbo' intact.
	        model_name = pred_path.rsplit('/', 1)[-1].rsplit('.', 1)[0]
	        print(f"\n{model_name}模型:")

	        # Load and align the data.
	        gold_graphs, pred_graphs = load_data(gold_path, pred_path)

	        # Evaluate and print the metrics.
	        results = evaluate_triples(gold_graphs, pred_graphs)
	        all_results[model_name] = results

	        # Append the prediction path and its metrics to the report right
	        # away, so finished models are recorded even if a later one fails.
	        triple_match = results['triple_match']
	        bert_score = results['bert_score']
	        with open(save_path, 'a', encoding='utf-8') as f:
	            f.write(f"{pred_path}: \n")
	            f.write("Triple Match: \n")
	            f.write(f"精确率: {triple_match['precision']:.4f}, 召回率: {triple_match['recall']:.4f}, F1: {triple_match['f1']:.4f}\n")
	            f.write("BERT Score: \n")
	            f.write(f"- Precision: {bert_score['precision']:.4f}\n")
	            f.write(f"- Recall: {bert_score['recall']:.4f}\n")
	            f.write(f"- F1: {bert_score['f1']:.4f}\n\n")