Spaces:
Sleeping
Sleeping
XufengDuan
committed on
Commit
•
3f72150
1
Parent(s):
fb7f810
update scripts
Browse files
- .DS_Store +0 -0
- src/backend/model_operations.py +45 -24
- src/backend/util.py +1 -1
- src/envs.py +1 -1
.DS_Store
CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
|
|
src/backend/model_operations.py
CHANGED
@@ -43,7 +43,7 @@ logging.basicConfig(level=logging.INFO,
|
|
43 |
# Load spacy model for word tokenization
|
44 |
nlp = spacy.load("en_core_web_sm")
|
45 |
nlp1 = spacy.load("en_core_web_trf")
|
46 |
-
os.environ["HUGGINGFACE_API_KEY"] = envs.TOKEN
|
47 |
|
48 |
def load_evaluation_model(model_path):
|
49 |
"""Load the evaluation model from the given path
|
@@ -173,7 +173,7 @@ class SummaryGenerator:
|
|
173 |
# print(ID, q_ID, prompt_value)
|
174 |
system_prompt = envs.SYSTEM_PROMPT
|
175 |
_user_prompt = prompt_value
|
176 |
-
for ii in range(
|
177 |
# user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
|
178 |
while True:
|
179 |
try:
|
@@ -405,20 +405,27 @@ class SummaryGenerator:
|
|
405 |
# max_tokens=1024,
|
406 |
# api_base= "https://api-inference.huggingface.co/models/" + self.model_id,
|
407 |
# )
|
408 |
-
self.model_id = 'command-r-plus' if 'command' in self.model_id else self.model_id
|
409 |
-
response = litellm.completion(
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
print("模型返回结果",response)
|
418 |
-
print("模型返回结果结束")
|
419 |
-
# exit()
|
420 |
-
result = response['choices'][0]['message']['content']
|
421 |
-
print(result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
422 |
return result
|
423 |
# exit()
|
424 |
except: # fail to call api. run it locally.
|
@@ -544,6 +551,11 @@ class EvaluationModel:
|
|
544 |
output.append("Other")
|
545 |
continue
|
546 |
rs = summaries_df["Response"][i].strip().lower()
|
|
|
|
|
|
|
|
|
|
|
547 |
'''Exp1'''
|
548 |
if summaries_df["Experiment"][i] == "E1":
|
549 |
print("E1", rs)
|
@@ -864,13 +876,22 @@ class EvaluationModel:
|
|
864 |
output.append("Other")
|
865 |
|
866 |
'''Exp4'''
|
867 |
-
|
868 |
elif summaries_df["Experiment"][i] == "E4":
|
869 |
-
|
870 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
871 |
target = summaries_df["Factor 2"][i].strip().lower()
|
872 |
pair = target + "_" + meaning_word
|
873 |
print("E4:", pair)
|
|
|
874 |
if pair in wordpair2code.keys():
|
875 |
output.append(wordpair2code[pair])
|
876 |
else:
|
@@ -1068,7 +1089,7 @@ class EvaluationModel:
|
|
1068 |
float: The average JS divergence across all common Question_IDs.
|
1069 |
"""
|
1070 |
# Load the datasets
|
1071 |
-
human_df = pd.
|
1072 |
llm_df = pd.read_csv(file_path_2)
|
1073 |
|
1074 |
def create_e5_entries(df):
|
@@ -1146,7 +1167,7 @@ class EvaluationModel:
|
|
1146 |
return avg_js_divergence
|
1147 |
|
1148 |
|
1149 |
-
def evaluate_humanlike(self, summaries_df, human_data_path, result_save_path):
|
1150 |
'''
|
1151 |
evaluate humanlike score
|
1152 |
1. code the result
|
@@ -1156,8 +1177,8 @@ class EvaluationModel:
|
|
1156 |
'''coding human data'''
|
1157 |
# self.huamn_df = pd.read_csv(human_data_path)
|
1158 |
# self.data = self.code_results(self.huamn_df)
|
1159 |
-
save_path = human_data_path.replace('.csv','_coding.csv')
|
1160 |
-
human_save_path = "./src/datasets/coding_human.xlsx"
|
1161 |
# if save_path is not None:
|
1162 |
# print(f'Save human coding results to {save_path}')
|
1163 |
# fpath = Path(save_path)
|
@@ -1175,7 +1196,7 @@ class EvaluationModel:
|
|
1175 |
self.llm_df.to_csv(fpath)
|
1176 |
# file_path_1 = '/Users/simon/Downloads/coding_human.xlsx'
|
1177 |
# file_path_2 = '/Users/simon/Downloads/Meta-Llama-3.1-70B-Instruct_coding.csv'
|
1178 |
-
avg_js_divergence = self.calculate_js_divergence(
|
1179 |
|
1180 |
return avg_js_divergence
|
1181 |
|
|
|
43 |
# Load spacy model for word tokenization
|
44 |
nlp = spacy.load("en_core_web_sm")
|
45 |
nlp1 = spacy.load("en_core_web_trf")
|
46 |
+
# os.environ["HUGGINGFACE_API_KEY"] = envs.TOKEN
|
47 |
|
48 |
def load_evaluation_model(model_path):
|
49 |
"""Load the evaluation model from the given path
|
|
|
173 |
# print(ID, q_ID, prompt_value)
|
174 |
system_prompt = envs.SYSTEM_PROMPT
|
175 |
_user_prompt = prompt_value
|
176 |
+
for ii in range(1):
|
177 |
# user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
|
178 |
while True:
|
179 |
try:
|
|
|
405 |
# max_tokens=1024,
|
406 |
# api_base= "https://api-inference.huggingface.co/models/" + self.model_id,
|
407 |
# )
|
408 |
+
# self.model_id = 'command-r-plus' if 'command' in self.model_id else self.model_id
|
409 |
+
# response = litellm.completion(
|
410 |
+
# model="huggingface/" + self.model_id,
|
411 |
+
# # mistralai/Mistral-7B-Instruct-v0.1",
|
412 |
+
# messages=[{"role": "system", "content": system_prompt},
|
413 |
+
# {"role": "user", "content": user_prompt}],
|
414 |
+
# #temperature=0.0,
|
415 |
+
# max_tokens=1024,
|
416 |
+
# api_base="https://api-inference.huggingface.co/models/" + self.model_id)
|
417 |
+
# print("模型返回结果",response)
|
418 |
+
# print("模型返回结果结束")
|
419 |
+
# # exit()
|
420 |
+
# result = response['choices'][0]['message']['content']
|
421 |
+
# print(result)
|
422 |
+
from huggingface_hub import InferenceClient
|
423 |
+
|
424 |
+
client = InferenceClient(self.model_id,api_key=envs.TOKEN)
|
425 |
+
messages = [{"role": "system", "content": system_prompt},{"role": "user", "content": user_prompt}]
|
426 |
+
outputs = client.chat_completion(messages, max_tokens=50)
|
427 |
+
result = outputs['choices'][0]['message']['content']
|
428 |
+
|
429 |
return result
|
430 |
# exit()
|
431 |
except: # fail to call api. run it locally.
|
|
|
551 |
output.append("Other")
|
552 |
continue
|
553 |
rs = summaries_df["Response"][i].strip().lower()
|
554 |
+
sentences = rs.split('\n')
|
555 |
+
sentences = [sentence.split(':', 1)[-1].strip() if ':' in sentence else sentence
|
556 |
+
for sentence in sentences]
|
557 |
+
rs = [sentence.strip() for sentence in sentences if sentence.strip()]
|
558 |
+
|
559 |
'''Exp1'''
|
560 |
if summaries_df["Experiment"][i] == "E1":
|
561 |
print("E1", rs)
|
|
|
876 |
output.append("Other")
|
877 |
|
878 |
'''Exp4'''
|
879 |
+
|
880 |
elif summaries_df["Experiment"][i] == "E4":
|
881 |
+
try:
|
882 |
+
meaning_word = rs.split(";")[4].replace(" ", '')
|
883 |
+
except IndexError:
|
884 |
+
output.append("Other")
|
885 |
+
continue
|
886 |
+
except Exception as e:
|
887 |
+
print(f"Unexpected error: {e}")
|
888 |
+
output.append("Other")
|
889 |
+
continue
|
890 |
+
|
891 |
target = summaries_df["Factor 2"][i].strip().lower()
|
892 |
pair = target + "_" + meaning_word
|
893 |
print("E4:", pair)
|
894 |
+
|
895 |
if pair in wordpair2code.keys():
|
896 |
output.append(wordpair2code[pair])
|
897 |
else:
|
|
|
1089 |
float: The average JS divergence across all common Question_IDs.
|
1090 |
"""
|
1091 |
# Load the datasets
|
1092 |
+
human_df = pd.read_csv(file_path_1, encoding='ISO-8859-1')
|
1093 |
llm_df = pd.read_csv(file_path_2)
|
1094 |
|
1095 |
def create_e5_entries(df):
|
|
|
1167 |
return avg_js_divergence
|
1168 |
|
1169 |
|
1170 |
+
def evaluate_humanlike(self, summaries_df: object, human_data_path: object, result_save_path: object) -> object:
|
1171 |
'''
|
1172 |
evaluate humanlike score
|
1173 |
1. code the result
|
|
|
1177 |
'''coding human data'''
|
1178 |
# self.huamn_df = pd.read_csv(human_data_path)
|
1179 |
# self.data = self.code_results(self.huamn_df)
|
1180 |
+
#save_path = human_data_path.replace('.csv','_coding.csv')
|
1181 |
+
#human_save_path = "./src/datasets/coding_human.xlsx"
|
1182 |
# if save_path is not None:
|
1183 |
# print(f'Save human coding results to {save_path}')
|
1184 |
# fpath = Path(save_path)
|
|
|
1196 |
self.llm_df.to_csv(fpath)
|
1197 |
# file_path_1 = '/Users/simon/Downloads/coding_human.xlsx'
|
1198 |
# file_path_2 = '/Users/simon/Downloads/Meta-Llama-3.1-70B-Instruct_coding.csv'
|
1199 |
+
avg_js_divergence = self.calculate_js_divergence(human_data_path, save_path)
|
1200 |
|
1201 |
return avg_js_divergence
|
1202 |
|
src/backend/util.py
CHANGED
@@ -61,7 +61,7 @@ def format_results(model_name: str, revision: str, precision: str,
|
|
61 |
},
|
62 |
"results": {
|
63 |
"hallucination_rate": {
|
64 |
-
"hallucination_rate": round(hallucination_rate,
|
65 |
},
|
66 |
"factual_consistency_rate": {
|
67 |
"factual_consistency_rate": round(factual_consistency_rate,1)
|
|
|
61 |
},
|
62 |
"results": {
|
63 |
"hallucination_rate": {
|
64 |
+
"hallucination_rate": round(hallucination_rate,3)
|
65 |
},
|
66 |
"factual_consistency_rate": {
|
67 |
"factual_consistency_rate": round(factual_consistency_rate,1)
|
src/envs.py
CHANGED
@@ -36,7 +36,7 @@ API = HfApi(token=TOKEN)
|
|
36 |
DATASET_PATH = "./src/datasets/Material_Llama2_0603.xlsx" #experiment data
|
37 |
PROMPT_PATH = "./src/datasets/prompt.xlsx" #prompt for each experiment
|
38 |
HEM_PATH = 'vectara/hallucination_evaluation_model'
|
39 |
-
HUMAN_DATA = "./src/datasets/
|
40 |
ITEM_4_DATA = "./src/datasets/associataion_dataset.csv" #database
|
41 |
ITEM_5_DATA = "./src/datasets/Items_5.csv" #experiment 5 need verb words
|
42 |
|
|
|
36 |
DATASET_PATH = "./src/datasets/Material_Llama2_0603.xlsx" #experiment data
|
37 |
PROMPT_PATH = "./src/datasets/prompt.xlsx" #prompt for each experiment
|
38 |
HEM_PATH = 'vectara/hallucination_evaluation_model'
|
39 |
+
HUMAN_DATA = "./src/datasets/human_data_coding.csv" #experiment data
|
40 |
ITEM_4_DATA = "./src/datasets/associataion_dataset.csv" #database
|
41 |
ITEM_5_DATA = "./src/datasets/Items_5.csv" #experiment 5 need verb words
|
42 |
|