# File size: 2,450 Bytes
# c87c295
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import os, sys

sys.path.append(os.path.dirname(os.path.dirname(__file__)))

import json
import re
from typing import List, Dict

# Directory, relative to the working directory, holding the JSON chat files.
DATA_DIR = "gpt_data_gen"

# Markers that delimit a code block in the rendered text.
B_CODE, E_CODE = "[CODE_START_TOK]", "[/CODE_END_TOK]"

# Markers that delimit a result block in the rendered text.
B_RESULT, E_RESULT = "[RESULT_TOK]", "[/RESULT_TOK]"

# Instruction delimiters placed around user turns.
B_INST = "[INST]"
E_INST = "[/INST]"

# System-prompt delimiters.
B_SYS = "<<SYS>>"
E_SYS = "<</SYS>>"

# Sequence start / end markers.
BOS, EOS = "<s>", "</s>"

# System prompt embedded at the top of every rendered conversation.
# NOTE: the leading/trailing newlines and the trailing space after "code."
# are part of the value and must be kept as-is.
CODE_SYS_PROMPT_FOR_TRAIN = (
    "\n"
    "You are 'CodeLLama', an advanced Language Model assistant that can generate, execute, and evaluate code. \n"
    "Respond to user queries by providing code-based solutions and insights.\n"
)


# Fenced ```python blocks inside assistant messages (compiled once, not per message).
_CODE_FENCE_RE = re.compile(r"```python\n(.*?)```", re.DOTALL)
# Fenced ```RESULT / ```RESULTS blocks inside assistant messages.
_RESULT_FENCE_RE = re.compile(r"```RESULTS?\n(.*?)```", re.DOTALL)
# Absolute path prefix that is rewritten to "./" in the rendered text.
_LOCAL_REPO_PREFIX = "/home/seungyoun/llama_code_interpreter/"


def _wrap_code_block(match) -> str:
    """Return the fenced code body wrapped in the B_CODE / E_CODE markers."""
    return f"{B_CODE}\n{match.group(1)}{E_CODE}"


def _wrap_result_block(match) -> str:
    """Return the fenced result body wrapped in the B_RESULT / E_RESULT markers."""
    return f"{B_RESULT}\n{match.group(1)}{E_RESULT}"


def msg_to_code_result_tok_temp(msg: List[Dict]) -> str:
    """Render a list of chat messages into a single marker-delimited string.

    The output starts with ``BOS``, ``B_INST`` and the system prompt wrapped
    in ``B_SYS`` / ``E_SYS``. Messages are then appended in order:

    * ``user``: the first user message is appended followed by ``E_INST``;
      every later one is additionally prefixed with ``BOS`` and ``B_INST``.
    * ``assistant``: the local repository path prefix is rewritten to
      ``./``, fenced ``python`` and ``RESULT``/``RESULTS`` blocks are
      replaced with the code / result markers, and ``EOS`` is appended.
    * any other role (including ``system``) is skipped.

    Roles are compared case-insensitively. The input messages are not
    modified.

    Args:
        msg: Messages as dicts with a ``"role"`` key and, for user and
            assistant messages, a ``"content"`` key.

    Returns:
        The rendered conversation string.

    Raises:
        KeyError: If a message has no ``"role"``, or a user/assistant
            message has no ``"content"``.
    """
    parts = [f"{BOS}{B_INST} {B_SYS}\n{CODE_SYS_PROMPT_FOR_TRAIN}\n{E_SYS}\n\n"]

    first_user_turn = True
    for chat in msg:
        # Normalise once so every role is matched the same way.
        role = chat["role"].lower()

        if role == "user":
            content = chat["content"]
            if first_user_turn:
                # The opening B_INST is already part of the header.
                parts.append(f"{content} {E_INST}")
                first_user_turn = False
            else:
                parts.append(f"{BOS}{B_INST}{content} {E_INST}")
        elif role == "assistant":
            # Work on a local copy so the caller's dicts stay untouched.
            content = chat["content"].replace(_LOCAL_REPO_PREFIX, "./")
            content = _CODE_FENCE_RE.sub(_wrap_code_block, content)
            content = _RESULT_FENCE_RE.sub(_wrap_result_block, content)
            parts.append(f"{content}{EOS}")
        # "system" and unrecognised roles contribute nothing.

    full_str = "".join(parts)

    # Clean up a stray "()" after a closing "')" in the rendered text.
    full_str = full_str.replace("')()", "')")
    # Also rewrite the path prefix where it appears outside assistant messages.
    full_str = full_str.replace(_LOCAL_REPO_PREFIX, "./")

    return full_str


def json_to_code_result_tok_temp(json_file_name: str = "425.json") -> str:
    """Load a JSON chat file from ``DATA_DIR`` and render it as one string.

    Args:
        json_file_name: File name inside ``DATA_DIR``.

    Returns:
        The string produced by ``msg_to_code_result_tok_temp`` for the
        loaded messages.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    file_rel_path = os.path.join(DATA_DIR, json_file_name)

    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, which can fail on non-ASCII content.
    with open(file_rel_path, "r", encoding="utf-8") as json_file:
        msg = json.load(json_file)

    return msg_to_code_result_tok_temp(msg)


if __name__ == "__main__":
    # Script entry point: render the default JSON file and write it to stdout.
    rendered = json_to_code_result_tok_temp()
    print(rendered)