|
import re |
|
import json |
|
|
|
|
|
with open("./datasets/glaive/glaive-function-calling-v2.json", "r") as f: |
|
data = json.load(f) |
|
|
|
|
|
def process_sample(sample): |
|
system_text = sample.get("system", "") |
|
chat_text = sample.get("chat", "") |
|
|
|
|
|
tools_match = re.findall( |
|
r'"name": "(.*?)",.*?"description": "(.*?)"', system_text, re.S |
|
) |
|
if not tools_match: |
|
return None |
|
tools = [{"name": name, "description": desc} for name, desc in tools_match] |
|
|
|
|
|
function_calls = re.findall(r'<functioncall> {"name": "(.*?)"', chat_text) |
|
if not function_calls: |
|
return None |
|
|
|
|
|
assistant_responses = re.findall( |
|
r"ASSISTANT: (.*?)<\|endoftext\|>", chat_text, re.S |
|
) |
|
instructions = [] |
|
for call, response in zip(function_calls, assistant_responses): |
|
|
|
user_prompt_match = re.search( |
|
rf'USER: (.*?)\n.*?<functioncall> {{.*?"{call}".*?}}', chat_text, re.S |
|
) |
|
if user_prompt_match: |
|
instructions.append( |
|
{ |
|
"instruction": user_prompt_match.group(1).strip(), |
|
"tools": tools, |
|
"used_tools": [call], |
|
} |
|
) |
|
|
|
return instructions |
|
|
|
|
|
from tqdm import tqdm |
|
|
|
results = [] |
|
for sample in tqdm(data, desc="Processing GLAIVE samples"): |
|
processed = process_sample(sample) |
|
if processed: |
|
results.extend(processed) |
|
|
|
with open("./datasets/glaive/output.json", "w") as f: |
|
json.dump(results, f, indent=2) |
|
|