import argparse
import csv
import datetime
import json
import os
import random
import re

import pandas as pd
from huggingface_hub import hf_hub_download
from openai import OpenAI
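# Overview (inferred from the functions below): given a political pledge and a set of
# GPT-4o-extracted events, ask gpt-4o whether each event summary is useful evidence for
# tracking the pledge, using in-context examples drawn from a Hugging Face dataset,
# then sort the labelled events by date.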
def gpt_4o_useful(prompt):
    """Query gpt-4o deterministically; return the full text plus the first token and its logprob."""
    client = OpenAI(api_key=os.environ.get("OAI"))
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    }
                ]
            }
        ],
        response_format={"type": "text"},
        temperature=0,  # deterministic decoding
        max_tokens=4096,
        top_p=0,
        frequency_penalty=0,
        presence_penalty=0,
        logprobs=True
    )
    text = response.choices[0].message.content
    # The first generated token is the predicted label; its logprob doubles as a confidence score.
    if response.choices[0].logprobs and response.choices[0].logprobs.content:
        first_token_logprob = response.choices[0].logprobs.content[0]
        token = first_token_logprob.token
        logprob = first_token_logprob.logprob
    else:
        token = None
        logprob = None
    return text, token, logprob
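# Usage sketch (hypothetical prompt; assumes the OAI env var holds an OpenAI API key).
# The first-token logprob can be turned into a probability-style confidence via exp():
#   text, token, logprob = gpt_4o_useful("Is this event summary useful?\nOutput:")
#   confidence = math.exp(logprob) if logprob is not None else None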
def get_ICL(data, top_k=None):
    """Format (up to top_k) labelled examples into an in-context-learning prompt block."""
    ICL = ""
    if top_k is not None:
        data = data[:top_k]
    for line in data:
        pledge = line["pledge"]
        event = line["event_description"]
        time = line["event_date"]
        example = f"Pledge: {pledge}\nEvent Summary: {event} (Event Date: {time})\nIs this event summary useful?"
        example = example.strip()
        output = line["label"].strip()
        ICL = f"{ICL}Input: {example}\nOutput: {output}\n\n"
    return ICL
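# One formatted demonstration produced above looks like (illustrative values):
#   Input: Pledge: <pledge text>
#   Event Summary: <event text> (Event Date: 2023-11-02)
#   Is this event summary useful?
#   Output: <label>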
def load_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data
def gpt_eval(test_instance, train_data, instruction, suggestion_meta, ICL_id=None):
    # With a user suggestion, restrict in-context examples to that pledge's data;
    # otherwise use a fixed shuffle so example selection is reproducible.
    if suggestion_meta:
        train_data = [line for line in train_data if str(line.get("pledge_id")) == str(ICL_id)]
    else:
        random.seed(42)
        random.shuffle(train_data)
    ICL = get_ICL(train_data, top_k=50)
    prompt = f"{instruction}\nBelow are examples:\n\n{ICL}Now, please assign a label for the below instance.\nInput: {test_instance}\nOutput:"
    try:
        text, token, logprob = gpt_4o_useful(prompt)
    except Exception as e:
        print(e)
        token = None
        logprob = None
    return token, logprob
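# Usage sketch (illustrative): label is the model's first output token (e.g. "Useful"),
# and score is that token's logprob, used downstream as a confidence value:
#   label, score = gpt_eval(test_instance, train_data, instruction, suggestion_meta=None)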
def extract_columns_to_dict(file_path, delimiter='\t'):
    """Map the third column of a delimited file to its fourth column, skipping short rows."""
    data_dict = {}
    with open(file_path, mode='r', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter=delimiter)
        for row in reader:
            if len(row) >= 4:
                key = row[2]
                value = row[3]
                data_dict[key] = value
    return data_dict
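# Assumed input layout (based on the commented-out call in extract_and_sort_events, which
# reads augmented_search_results.tsv): at least four tab-separated columns, with the
# key/value pair taken from columns 3 and 4; the exact column semantics are not specified
# in this file.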
def parse_date(date_str):
    try:
        return datetime.datetime.strptime(date_str, "%Y-%m-%d"), date_str
    except ValueError:
        # Relative dates such as "Last week (relative to 2024-05-01)"
        match = re.search(r'(.*) \(relative to (\d{4}-\d{2}-\d{2})\)', date_str)
        if match:
            reference = datetime.datetime.strptime(match.group(2), "%Y-%m-%d")
            if "Last month" in match.group(1):
                return reference - datetime.timedelta(days=30), date_str
            elif "Yesterday" in match.group(1):
                return reference - datetime.timedelta(days=1), date_str
            elif "Last week" in match.group(1):
                return reference - datetime.timedelta(days=7), date_str
            elif "This week" in match.group(1):
                return reference, date_str
        # Handle other date formats
        match = re.fullmatch(r'\d{4}', date_str)  # year only: '2014'
        if match:
            return datetime.datetime(int(date_str), 1, 1), date_str
        match = re.fullmatch(r'(\w+) (\d{4})', date_str)  # month + year: 'November 2023'
        if match:
            try:
                return datetime.datetime.strptime(date_str, "%B %Y"), date_str
            except ValueError:
                return None, date_str
        match = re.fullmatch(r'(\d{4})-Q(\d)', date_str)  # quarter: '2024-Q1'
        if match:
            year, quarter = int(match.group(1)), int(match.group(2))
            month = (quarter - 1) * 3 + 1
            return datetime.datetime(year, month, 1), date_str
        match = re.fullmatch(r'(\d{4}) (Spring|Summer|Autumn|Fall|Winter)', date_str, re.IGNORECASE)  # season: '2023 Autumn' or '2023 Fall'
        if match:
            year = int(match.group(1))
            season_map = {"Spring": 3, "Summer": 6, "Autumn": 9, "Fall": 9, "Winter": 12}
            month = season_map[match.group(2).capitalize()]
            return datetime.datetime(year, month, 1), date_str
        return None, date_str
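# Examples of accepted inputs (illustrative):
#   parse_date("2024-03-15") -> (datetime(2024, 3, 15, 0, 0), "2024-03-15")
#   parse_date("2014")       -> (datetime(2014, 1, 1, 0, 0), "2014")
#   parse_date("2024-Q2")    -> (datetime(2024, 4, 1, 0, 0), "2024-Q2")
#   parse_date("2023 Fall")  -> (datetime(2023, 9, 1, 0, 0), "2023 Fall")
#   parse_date("no date")    -> (None, "no date")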
def extract_and_sort_events(data_dir, pledge_date, pledge_author, claim, suggestion_meta):
    events = []
    # url_path = os.path.join(data_dir, "augmented_search_results.tsv")
    # url_query_dict = extract_columns_to_dict(file_path=url_path, delimiter='\t')
    pledge = claim.strip()
    file_path = os.path.join(data_dir, "gpt4_event_extraction", "gpt4o_results_0_claim.json")
    gpt4_results_json = load_json(file_path)

    # In-context examples and the instruction prompt live in a Hugging Face dataset.
    train_file_path = hf_hub_download(
        repo_id="PledgeTracker/demo_feedback",
        filename="train_useful.json",
        repo_type="dataset",
        token=os.environ["HF_TOKEN"]
    )
    with open(train_file_path, "r", encoding="utf-8") as f:
        train_data = json.load(f)

    instruction_path = hf_hub_download(
        repo_id="PledgeTracker/demo_feedback",
        filename="instruction.txt",
        repo_type="dataset",
        token=os.environ["HF_TOKEN"]
    )
    with open(instruction_path, "r") as f:
        instruction = f.read()

    # mapping.txt maps a suggestion index (line number) to a pledge_id.
    map_file_path = hf_hub_download(
        repo_id="PledgeTracker/demo_feedback",
        filename="mapping.txt",
        repo_type="dataset",
        token=os.environ["HF_TOKEN"]
    )
    with open(map_file_path, "r") as f:
        mapping = {map_id: int(line.strip()) for map_id, line in enumerate(f)}

    ICL_id = None
    if suggestion_meta:
        try:
            idx = int(suggestion_meta["index"])
            ICL_id = mapping.get(idx)
            print(f"[Suggestion] index: {idx} → pledge_id: {ICL_id}")
        except Exception as e:
            print(f"[Mapping error]: {e}")

    for doc in gpt4_results_json:
        meta_date = doc["date"]  # publication date of the source document
        for event in doc.get("output", {}).get("events", []):
            parsed_date, original_date = parse_date(event["date"])
            if parsed_date:
                # Show the publication date alongside the event date when they differ.
                if meta_date != original_date:
                    event_date_and_pub_date = original_date + f" ({meta_date})"
                else:
                    event_date_and_pub_date = original_date
                test_instance = f"Pledge: {pledge} (Speaker: {pledge_author}; Pledge Date: {pledge_date})\nEvent Summary: {event['event']} (Event Date: {original_date})\nIs this event summary useful?"
                print(test_instance)
                label, score = gpt_eval(test_instance, train_data, instruction, suggestion_meta, ICL_id=ICL_id)
                URL = doc["url"]
                events.append({
                    "date": original_date,
                    "event date (publication date if different)": event_date_and_pub_date,
                    "event": event["event"],
                    "url": URL,
                    "label": label,
                    "confident": score
                })
    # Sort events by parsed date, newest first.
    events.sort(key=lambda x: parse_date(x["date"])[0], reverse=True)
    return events
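# Minimal CLI sketch (hypothetical argument names and values; assumes the OAI and HF_TOKEN
# env vars are set and data_dir contains gpt4_event_extraction/gpt4o_results_0_claim.json):
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", required=True)
    parser.add_argument("--pledge_date", required=True)
    parser.add_argument("--pledge_author", required=True)
    parser.add_argument("--claim", required=True)
    args = parser.parse_args()
    events = extract_and_sort_events(
        args.data_dir, args.pledge_date, args.pledge_author, args.claim, suggestion_meta=None
    )
    print(json.dumps(events, indent=2, ensure_ascii=False))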