Spaces:
Runtime error
Runtime error
Commit
·
d54fa91
1
Parent(s):
d197237
finished v1
Browse files- app.py +43 -0
- src/agent.py +0 -0
- src/core.py +64 -0
- src/data/output/table_mapping.csv +9 -0
- src/data/synthetic/legal_entries_a.csv +101 -0
- src/data/synthetic/legal_entries_b.csv +101 -0
- src/data/synthetic/legal_template.csv +101 -0
- src/notebooks/table_mapping +0 -9
- src/prompt.py +94 -0
- src/types.py +12 -0
- src/vars.py +1 -0
app.py
CHANGED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
from src.types import TableMapping
|
4 |
+
from src.core import get_dataframes, get_table_mapping, save_csv_file, sanitize_python_output
|
5 |
+
|
6 |
+
# Load the bundled example tables once at start-up; the uploaded files below
# are not wired to any handler yet, so the demo always uses these.
source_df, template_df = get_dataframes()

# Compute the mapping before building the UI. This calls the language model,
# so it happens exactly once when the app starts.
table_mapping: TableMapping = get_table_mapping(source_df, template_df)
table_mapping_df = pd.DataFrame(table_mapping.dict()['table_mappings'])


def _save_table_mapping():
    """Write the displayed mapping table to the output directory as CSV."""
    save_csv_file(table_mapping_df, 'table_mapping')


with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown("## To begin, upload a Template CSV and a Source CSV file.")
        with gr.Row():
            # gr.File replaces the deprecated gr.inputs.File; the explicit
            # legacy type="file" value is dropped in favour of the default.
            gr.File(label="Template", file_count='single')
            gr.File(label="Source", file_count='single')

    with gr.Column():
        gr.Markdown("## Mapping from Source to Template")
        with gr.Row():
            gr.DataFrame(value=table_mapping_df)
            save_mapping_btn = gr.Button(value="Save Mapping", variant="secondary")
            save_mapping_btn.click(fn=_save_table_mapping)

        with gr.Row():
            generate_code_btn = gr.Button(value="Generate Code from Mapping", variant="primary")
            # generate_code_btn.click(fn=generate_code, outputs=test)

    # Planned UI that is not enabled yet, kept from the original for reference:
    # with gr.Column():
    #     gr.Markdown("## Here is the code that will be used to transform the source file into the template schema:")
    #     gr.Code(language="python", value=sanitize_python_output(transform_code))
    #
    # with gr.Row():
    #     gr.Button(value="Transform Source", variant="primary", trigger="transform_source")
    #     gr.Button(value="Save Code", variant="secondary", trigger="save_code")
    #
    # with gr.Row():
    #     with gr.Column():
    #         gr.Dataframe(label='Target (template)', type='pandas', value=template_df)
    #     with gr.Column():
    #         gr.Dataframe(label='Source (transformed)', type='pandas', value=PythonAstREPLTool(locals={'source_df': table_1_df}).run(transform_code))

demo.launch()
|
src/agent.py
DELETED
File without changes
|
src/core.py
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import pandas as pd
|
3 |
+
from langchain.output_parsers import PydanticOutputParser
|
4 |
+
from langchain.prompts import ChatPromptTemplate
|
5 |
+
from langchain.tools import PythonAstREPLTool
|
6 |
+
from langchain.chat_models import ChatOpenAI
|
7 |
+
from langchain.schema.output_parser import StrOutputParser
|
8 |
+
from langchain.chat_models import ChatOpenAI
|
9 |
+
from src.types import TableMapping
|
10 |
+
from src.vars import NUM_ROWS_TO_RETURN
|
11 |
+
from src.prompt import DATA_SCIENTIST_PROMPT_STR, SPEC_WRITER_PROMPT_STR, ENGINEER_PROMPT_STR
|
12 |
+
|
13 |
+
# SECURITY: the OpenAI API key must be supplied through the process
# environment (for example a deployment secret), never committed to source
# control. The key that used to be hard-coded here is exposed in the
# repository history and must be revoked.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it in the environment before starting the app."
    )

# Directory layout for bundled data, resolved relative to this module.
DATA_DIR_PATH = os.path.join(os.path.dirname(__file__), 'data')
SYNTHETIC_DATA_DIR_PATH = os.path.join(DATA_DIR_PATH, 'synthetic')

# Model used for the mapping and code-generation chains (temperature 0).
TRANSFORM_MODEL = ChatOpenAI(
    model_name='gpt-4',
    temperature=0,
)

# Model used for the natural-language spec chain (temperature 0.1).
natural_language_model = ChatOpenAI(
    model_name='gpt-4',
    temperature=0.1,
)
|
27 |
+
|
28 |
+
def get_dataframes():
    """Load the bundled synthetic source and template tables.

    Returns:
        A ``(source, template)`` tuple of DataFrames read from
        ``legal_entries_a.csv`` and ``legal_template.csv`` inside
        ``SYNTHETIC_DATA_DIR_PATH``.
    """
    source_path = os.path.join(SYNTHETIC_DATA_DIR_PATH, 'legal_entries_a.csv')
    template_path = os.path.join(SYNTHETIC_DATA_DIR_PATH, 'legal_template.csv')
    return pd.read_csv(source_path), pd.read_csv(template_path)
|
32 |
+
|
33 |
+
def get_data_str_from_df_for_prompt(df, num_rows_to_return=NUM_ROWS_TO_RETURN):
    """Render the head of ``df`` as a markdown table wrapped in ``<df>`` tags.

    Args:
        df: DataFrame to preview.
        num_rows_to_return: Number of leading rows to include.

    Returns:
        The markdown preview on its own lines between ``<df>`` and ``</df>``.
    """
    preview_markdown = df.head(num_rows_to_return).to_markdown()
    return f'<df>\n{preview_markdown}\n</df>'
|
35 |
+
|
36 |
+
def get_table_mapping(source_df, template_df) -> TableMapping:
    """Ask the transform model for a source-to-template column mapping.

    Args:
        source_df: DataFrame whose head is shown to the model as the source.
        template_df: DataFrame whose head is shown to the model as the target.

    Returns:
        The model output parsed by a ``PydanticOutputParser`` built for
        ``TableMapping``.
    """
    parser = PydanticOutputParser(pydantic_object=TableMapping)
    prompt = ChatPromptTemplate.from_template(
        template=DATA_SCIENTIST_PROMPT_STR,
        # The parser's format instructions are bound as a partial variable.
        partial_variables={'format_instructions': parser.get_format_instructions()},
    )
    prompt_inputs = {
        "source_1_csv_str": get_data_str_from_df_for_prompt(source_df),
        "target_csv_str": get_data_str_from_df_for_prompt(template_df),
    }
    pipeline = prompt | TRANSFORM_MODEL | parser
    return pipeline.invoke(prompt_inputs)
|
45 |
+
|
46 |
+
|
47 |
+
def get_code_spec(table_mapping: TableMapping) -> str:
    """Turn a table mapping into a written spec using the natural-language model.

    Args:
        table_mapping: Mapping to describe; it is passed to the prompt as
            ``str(table_mapping)``.

    Returns:
        The model's reply as a plain string.
    """
    spec_prompt = ChatPromptTemplate.from_template(SPEC_WRITER_PROMPT_STR)
    pipeline = spec_prompt | natural_language_model | StrOutputParser()
    payload = {"table_mapping": str(table_mapping)}
    return pipeline.invoke(payload)
|
51 |
+
|
52 |
+
|
53 |
+
def get_mapping_code(spec_str: str) -> str:
    """Generate transformation code from a written spec using the transform model.

    Args:
        spec_str: Spec text inserted into the engineer prompt.

    Returns:
        The model's reply as a plain string.
    """
    code_prompt = ChatPromptTemplate.from_template(ENGINEER_PROMPT_STR)
    pipeline = code_prompt | TRANSFORM_MODEL | StrOutputParser()
    payload = {"spec_str": spec_str}
    return pipeline.invoke(payload)
|
57 |
+
|
58 |
+
|
59 |
+
def sanitize_python_output(text: str):
    """Extract the body of the first python-fenced code block in ``text``.

    Args:
        text: Raw text that may contain a fenced python code block.

    Returns:
        Everything between the first opening python fence and the next
        closing fence. If ``text`` has no python fence it is returned
        unchanged, on the assumption that it is already bare code.
    """
    opening_fence = "```python"
    # partition never raises: it copes with a missing fence and with several
    # fenced blocks, where unpacking the result of split() used to fail.
    _, fence_found, after = text.partition(opening_fence)
    if not fence_found:
        return text
    return after.split("```")[0]
|
62 |
+
|
63 |
+
def save_csv_file(df, filename):
    """Write ``df`` as ``<filename>.csv`` in the ``output`` data directory.

    Args:
        df: DataFrame to save; it is written with ``DataFrame.to_csv``
            defaults.
        filename: Base file name without the ``.csv`` extension.
    """
    output_dir = os.path.join(DATA_DIR_PATH, 'output')
    # Create the directory on first use so saving works on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(os.path.join(output_dir, filename) + '.csv')
|
src/data/output/table_mapping.csv
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
,source_column_name,target_column_name,value_transformations,explanation
|
2 |
+
0,case_date,CaseDate,NO_TRANSFORM,The 'case_date' column in the source directly maps to the 'CaseDate' column in the target with no transformation needed.
|
3 |
+
1,lastname,FullName,"Concatenate 'lastname' and 'firstname' from source, with a space in between, and 'firstname' preceding 'lastname'.","The 'lastname' and 'firstname' columns in the source are combined to form the 'FullName' column in the target, with 'firstname' coming first."
|
4 |
+
2,firstname,FullName,"Concatenate 'firstname' and 'lastname' from source, with a space in between, and 'firstname' preceding 'lastname'.","The 'firstname' and 'lastname' columns in the source are combined to form the 'FullName' column in the target, with 'firstname' coming first."
|
5 |
+
3,case_type,CaseType,NO_TRANSFORM,The 'case_type' column in the source directly maps to the 'CaseType' column in the target with no transformation needed.
|
6 |
+
4,case_id,CaseID,Replace 'CR-' prefix in source with 'CASE-' in target.,"The 'case_id' column in the source maps to the 'CaseID' column in the target, with the 'CR-' prefix replaced by 'CASE-'."
|
7 |
+
5,court_fee,Fee,NO_TRANSFORM,The 'court_fee' column in the source directly maps to the 'Fee' column in the target with no transformation needed.
|
8 |
+
6,jurisdiction,Jurisdiction,Capitalize the first letter of each word in the source.,"The 'jurisdiction' column in the source maps to the 'Jurisdiction' column in the target, with the first letter of each word capitalized."
|
9 |
+
7,judge_last_name,NO_TARGET,NO_TRANSFORM,The 'judge_last_name' column in the source does not have a corresponding column in the target and can be ignored.
|
src/data/synthetic/legal_entries_a.csv
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
case_date,lastname,firstname,case_type,case_id,court_fee,jurisdiction,judge_last_name
|
2 |
+
2023-01-16,Okafor,Jane,Civil,CR-6190,250,BOSTON,Connor
|
3 |
+
2023-08-10,Malcolm,Elena,Civil,CR-3092,100,chicago,James
|
4 |
+
2023-06-14,Nasser,Alan,Civil,CR-5947,150,BOSTON,Skaarsgard
|
5 |
+
2023-07-17,Smith,Miguel,Family,CR-7727,250,LOS angeles,Brown
|
6 |
+
2023-07-25,Kim,John,Criminal,CR-4120,150,BOSTON,Skaarsgard
|
7 |
+
2023-07-14,Brown,John,Civil,CR-8850,100,LOS angeles,Brown
|
8 |
+
2023-01-19,Nasser,Dmitri,Criminal,CR-2308,100,chicago,Connor
|
9 |
+
2023-02-26,Rodriguez,Alan,Criminal,CR-4477,100,chicago,Morgan
|
10 |
+
2023-02-10,Brown,Alice,Criminal,CR-9490,200,chicago,Morgan
|
11 |
+
2023-09-12,Smith,Nadia,Family,CR-4111,100,LOS angeles,Skaarsgard
|
12 |
+
2023-02-25,Kim,Chen,Criminal,CR-9421,150,BOSTON,Oleg
|
13 |
+
2023-09-15,Kim,John,Family,CR-3270,200,houston,Morgan
|
14 |
+
2023-07-22,Patel,Nadia,Family,CR-1501,200,houston,Skaarsgard
|
15 |
+
2023-01-27,Lee,Lakshmi,Family,CR-8321,150,houston,Brown
|
16 |
+
2023-01-14,Brown,John,Family,CR-2748,100,LOS angeles,James
|
17 |
+
2023-07-13,Malcolm,Miguel,Family,CR-3163,100,LOS angeles,Skaarsgard
|
18 |
+
2023-02-26,Smith,Alice,Civil,CR-4296,150,BOSTON,James
|
19 |
+
2023-09-25,Patel,Terrance,Criminal,CR-2230,200,houston,Morgan
|
20 |
+
2023-02-13,Ivanov,Alan,Family,CR-9353,100,new York,Morgan
|
21 |
+
2023-04-18,Chatterjee,Alice,Civil,CR-8786,100,chicago,Skaarsgard
|
22 |
+
2023-09-11,Brown,Jane,Criminal,CR-6001,100,LOS angeles,Connor
|
23 |
+
2023-02-16,Okafor,Jane,Criminal,CR-9434,250,BOSTON,Oleg
|
24 |
+
2023-07-22,Chatterjee,Dmitri,Criminal,CR-1042,100,BOSTON,Brown
|
25 |
+
2023-08-28,Smith,Miguel,Family,CR-1427,150,LOS angeles,Brown
|
26 |
+
2023-06-14,Johnson,Miguel,Civil,CR-7553,200,chicago,Skaarsgard
|
27 |
+
2023-02-24,Ivanov,Chen,Civil,CR-2242,250,LOS angeles,Connor
|
28 |
+
2023-06-23,Rodriguez,Terrance,Criminal,CR-6940,250,houston,Connor
|
29 |
+
2023-01-10,Johnson,Elena,Civil,CR-4064,150,houston,Oleg
|
30 |
+
2023-01-15,Patel,Chen,Civil,CR-3129,100,new York,Morgan
|
31 |
+
2023-08-16,Malcolm,Oluwaseun,Civil,CR-2758,150,BOSTON,Connor
|
32 |
+
2023-02-24,Ivanov,Lakshmi,Criminal,CR-9562,250,BOSTON,Brown
|
33 |
+
2023-05-15,Okafor,Terrance,Criminal,CR-2292,250,BOSTON,Morgan
|
34 |
+
2023-06-26,Patel,Jane,Criminal,CR-7889,250,LOS angeles,Brown
|
35 |
+
2023-02-14,Rodriguez,John,Family,CR-5178,150,houston,Morgan
|
36 |
+
2023-05-15,Patel,Terrance,Civil,CR-5004,150,houston,Morgan
|
37 |
+
2023-03-19,Johnson,Alice,Family,CR-2883,200,new York,Morgan
|
38 |
+
2023-02-12,Rodriguez,Alan,Family,CR-4416,200,BOSTON,Brown
|
39 |
+
2023-07-25,Malcolm,Chen,Civil,CR-9332,200,houston,Morgan
|
40 |
+
2023-09-15,Chatterjee,Miguel,Civil,CR-7699,250,BOSTON,Skaarsgard
|
41 |
+
2023-03-13,Lee,Nadia,Civil,CR-7258,100,new York,James
|
42 |
+
2023-05-27,Brown,Nadia,Civil,CR-7490,200,houston,Morgan
|
43 |
+
2023-02-22,Johnson,Alice,Civil,CR-8231,100,chicago,Connor
|
44 |
+
2023-03-18,Malcolm,Nadia,Criminal,CR-2720,100,new York,Oleg
|
45 |
+
2023-06-11,Brown,Nadia,Criminal,CR-4277,100,BOSTON,Connor
|
46 |
+
2023-02-22,Okafor,Oluwaseun,Criminal,CR-9738,100,new York,Skaarsgard
|
47 |
+
2023-08-19,Patel,Jane,Civil,CR-2452,250,BOSTON,Skaarsgard
|
48 |
+
2023-09-27,Lee,Alan,Family,CR-1899,100,new York,Skaarsgard
|
49 |
+
2023-04-21,Malcolm,Dmitri,Family,CR-8404,150,LOS angeles,Brown
|
50 |
+
2023-03-10,Chatterjee,Alice,Family,CR-4240,100,LOS angeles,Morgan
|
51 |
+
2023-05-13,Kim,Elena,Family,CR-6153,250,chicago,Brown
|
52 |
+
2023-09-10,Patel,Alan,Criminal,CR-3485,200,chicago,Oleg
|
53 |
+
2023-08-18,Kim,Lakshmi,Criminal,CR-5520,200,LOS angeles,James
|
54 |
+
2023-02-21,Patel,Alan,Criminal,CR-9879,250,LOS angeles,Brown
|
55 |
+
2023-05-12,Brown,Jane,Criminal,CR-5259,200,new York,James
|
56 |
+
2023-01-20,Patel,Oluwaseun,Criminal,CR-8333,100,BOSTON,James
|
57 |
+
2023-01-23,Nasser,Chen,Civil,CR-2711,200,LOS angeles,Morgan
|
58 |
+
2023-03-12,Brown,Miguel,Family,CR-5100,100,LOS angeles,Morgan
|
59 |
+
2023-01-15,Rodriguez,Terrance,Criminal,CR-4849,100,LOS angeles,Morgan
|
60 |
+
2023-05-17,Lee,Jane,Criminal,CR-8058,150,new York,Skaarsgard
|
61 |
+
2023-04-18,Okafor,Chen,Civil,CR-9076,100,new York,Brown
|
62 |
+
2023-02-22,Chatterjee,Lakshmi,Criminal,CR-5230,200,BOSTON,Morgan
|
63 |
+
2023-08-18,Brown,John,Criminal,CR-7094,200,LOS angeles,Connor
|
64 |
+
2023-08-17,Lee,Oluwaseun,Civil,CR-8915,150,BOSTON,Oleg
|
65 |
+
2023-08-18,Malcolm,Alan,Family,CR-9030,100,chicago,Brown
|
66 |
+
2023-02-13,Malcolm,Chen,Criminal,CR-1482,150,houston,Morgan
|
67 |
+
2023-02-16,Brown,John,Criminal,CR-3535,100,BOSTON,Morgan
|
68 |
+
2023-08-20,Johnson,Chen,Criminal,CR-2029,250,houston,James
|
69 |
+
2023-01-10,Kim,Alan,Civil,CR-1812,250,houston,Oleg
|
70 |
+
2023-02-18,Chatterjee,Alice,Civil,CR-5295,150,chicago,Oleg
|
71 |
+
2023-08-25,Lee,Miguel,Criminal,CR-6850,150,LOS angeles,Brown
|
72 |
+
2023-05-12,Malcolm,Alan,Criminal,CR-7973,150,BOSTON,Connor
|
73 |
+
2023-05-19,Johnson,Chen,Family,CR-5221,200,houston,Skaarsgard
|
74 |
+
2023-06-17,Okafor,John,Criminal,CR-4117,250,BOSTON,Oleg
|
75 |
+
2023-03-18,Patel,Elena,Family,CR-2368,100,houston,Skaarsgard
|
76 |
+
2023-06-22,Rodriguez,Lakshmi,Family,CR-8384,200,new York,Oleg
|
77 |
+
2023-07-14,Smith,Miguel,Civil,CR-4476,100,new York,Morgan
|
78 |
+
2023-03-26,Brown,Chen,Civil,CR-4545,100,houston,Morgan
|
79 |
+
2023-06-22,Chatterjee,Dmitri,Civil,CR-4421,250,houston,Skaarsgard
|
80 |
+
2023-03-20,Patel,Miguel,Criminal,CR-6559,150,new York,Morgan
|
81 |
+
2023-07-11,Kim,Oluwaseun,Civil,CR-1803,250,BOSTON,Brown
|
82 |
+
2023-03-13,Okafor,Elena,Civil,CR-8622,150,new York,Brown
|
83 |
+
2023-05-27,Lee,Alice,Criminal,CR-9488,200,LOS angeles,Connor
|
84 |
+
2023-05-14,Patel,Alice,Civil,CR-4581,150,chicago,Brown
|
85 |
+
2023-06-27,Malcolm,Terrance,Criminal,CR-2388,250,chicago,James
|
86 |
+
2023-02-13,Ivanov,Terrance,Criminal,CR-6529,100,LOS angeles,Connor
|
87 |
+
2023-01-21,Patel,Terrance,Family,CR-4443,150,BOSTON,Oleg
|
88 |
+
2023-09-22,Malcolm,John,Civil,CR-8721,200,new York,Skaarsgard
|
89 |
+
2023-02-12,Malcolm,Miguel,Family,CR-3780,250,new York,Oleg
|
90 |
+
2023-04-26,Kim,Alan,Criminal,CR-2663,250,new York,Connor
|
91 |
+
2023-03-16,Ivanov,Lakshmi,Criminal,CR-8702,150,LOS angeles,Morgan
|
92 |
+
2023-07-22,Ivanov,Jane,Criminal,CR-1232,100,BOSTON,Connor
|
93 |
+
2023-05-28,Okafor,Nadia,Family,CR-5215,150,houston,Brown
|
94 |
+
2023-03-14,Okafor,Oluwaseun,Criminal,CR-6631,250,BOSTON,Skaarsgard
|
95 |
+
2023-06-17,Nasser,Alan,Civil,CR-1405,150,BOSTON,Morgan
|
96 |
+
2023-08-13,Kim,Oluwaseun,Civil,CR-8816,100,LOS angeles,James
|
97 |
+
2023-07-20,Brown,Oluwaseun,Family,CR-2665,150,new York,Morgan
|
98 |
+
2023-05-16,Patel,Alan,Family,CR-2874,100,new York,Connor
|
99 |
+
2023-07-15,Chatterjee,Nadia,Family,CR-2037,100,houston,James
|
100 |
+
2023-04-18,Johnson,Dmitri,Criminal,CR-5402,200,houston,Morgan
|
101 |
+
2023-08-14,Johnson,Chen,Civil,CR-3569,250,BOSTON,Morgan
|
src/data/synthetic/legal_entries_b.csv
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Date_of_Case,Fee,FullName,CaseNumber,CaseKind,Location,Weather
|
2 |
+
2023/01/16,250.0,Jane Okafor,case--6190,Civil,BOSTO,snowy
|
3 |
+
2023/08/10,100.0,Elena Malcolm,case--3092,Civil,CHIC,sunny
|
4 |
+
2023/06/14,150.0,Alan Nasser,case--5947,Civil,BOSTO,rainy
|
5 |
+
2023/07/17,250.0,Miguel Smith,case--7727,Family,LOSAN,snowy
|
6 |
+
2023/07/25,150.0,John Kim,case--4120,Criminal,BOSTO,rainy
|
7 |
+
2023/07/14,100.0,John Brown,case--8850,Civil,LOSAN,snowy
|
8 |
+
2023/01/19,100.0,Dmitri Nasser,case--2308,Criminal,CHIC,sunny
|
9 |
+
2023/02/26,100.0,Alan Rodriguez,case--4477,Criminal,CHIC,cloudy
|
10 |
+
2023/02/10,200.0,Alice Brown,case--9490,Criminal,CHICA,snowy
|
11 |
+
2023/09/12,100.0,Nadia Smith,case--4111,Family,LOSA,sunny
|
12 |
+
2023/02/25,150.0,Chen Kim,case--9421,Criminal,BOST,sunny
|
13 |
+
2023/09/15,200.0,John Kim,case--3270,Family,HOUST,sunny
|
14 |
+
2023/07/22,200.0,Nadia Patel,case--1501,Family,HOUS,cloudy
|
15 |
+
2023/01/27,150.0,Lakshmi Lee,case--8321,Family,HOUST,snowy
|
16 |
+
2023/01/14,100.0,John Brown,case--2748,Family,LOSAN,snowy
|
17 |
+
2023/07/13,100.0,Miguel Malcolm,case--3163,Family,LOSA,rainy
|
18 |
+
2023/02/26,150.0,Alice Smith,case--4296,Civil,BOSTO,cloudy
|
19 |
+
2023/09/25,200.0,Terrance Patel,case--2230,Criminal,HOUS,snowy
|
20 |
+
2023/02/13,100.0,Alan Ivanov,case--9353,Family,NEWY,sunny
|
21 |
+
2023/04/18,100.0,Alice Chatterjee,case--8786,Civil,CHICA,rainy
|
22 |
+
2023/09/11,100.0,Jane Brown,case--6001,Criminal,LOSA,snowy
|
23 |
+
2023/02/16,250.0,Jane Okafor,case--9434,Criminal,BOST,snowy
|
24 |
+
2023/07/22,100.0,Dmitri Chatterjee,case--1042,Criminal,BOST,rainy
|
25 |
+
2023/08/28,150.0,Miguel Smith,case--1427,Family,LOSA,cloudy
|
26 |
+
2023/06/14,200.0,Miguel Johnson,case--7553,Civil,CHIC,sunny
|
27 |
+
2023/02/24,250.0,Chen Ivanov,case--2242,Civil,LOSA,rainy
|
28 |
+
2023/06/23,250.0,Terrance Rodriguez,case--6940,Criminal,HOUST,rainy
|
29 |
+
2023/01/10,150.0,Elena Johnson,case--4064,Civil,HOUS,rainy
|
30 |
+
2023/01/15,100.0,Chen Patel,case--3129,Civil,NEWY,rainy
|
31 |
+
2023/08/16,150.0,Oluwaseun Malcolm,case--2758,Civil,BOSTO,snowy
|
32 |
+
2023/02/24,250.0,Lakshmi Ivanov,case--9562,Criminal,BOSTO,sunny
|
33 |
+
2023/05/15,250.0,Terrance Okafor,case--2292,Criminal,BOST,rainy
|
34 |
+
2023/06/26,250.0,Jane Patel,case--7889,Criminal,LOSAN,cloudy
|
35 |
+
2023/02/14,150.0,John Rodriguez,case--5178,Family,HOUS,sunny
|
36 |
+
2023/05/15,150.0,Terrance Patel,case--5004,Civil,HOUST,snowy
|
37 |
+
2023/03/19,200.0,Alice Johnson,case--2883,Family,NEWYO,snowy
|
38 |
+
2023/02/12,200.0,Alan Rodriguez,case--4416,Family,BOSTO,rainy
|
39 |
+
2023/07/25,200.0,Chen Malcolm,case--9332,Civil,HOUS,snowy
|
40 |
+
2023/09/15,250.0,Miguel Chatterjee,case--7699,Civil,BOST,rainy
|
41 |
+
2023/03/13,100.0,Nadia Lee,case--7258,Civil,NEWYO,snowy
|
42 |
+
2023/05/27,200.0,Nadia Brown,case--7490,Civil,HOUS,snowy
|
43 |
+
2023/02/22,100.0,Alice Johnson,case--8231,Civil,CHIC,cloudy
|
44 |
+
2023/03/18,100.0,Nadia Malcolm,case--2720,Criminal,NEWY,cloudy
|
45 |
+
2023/06/11,100.0,Nadia Brown,case--4277,Criminal,BOST,snowy
|
46 |
+
2023/02/22,100.0,Oluwaseun Okafor,case--9738,Criminal,NEWYO,snowy
|
47 |
+
2023/08/19,250.0,Jane Patel,case--2452,Civil,BOSTO,snowy
|
48 |
+
2023/09/27,100.0,Alan Lee,case--1899,Family,NEWY,rainy
|
49 |
+
2023/04/21,150.0,Dmitri Malcolm,case--8404,Family,LOSAN,rainy
|
50 |
+
2023/03/10,100.0,Alice Chatterjee,case--4240,Family,LOSA,snowy
|
51 |
+
2023/05/13,250.0,Elena Kim,case--6153,Family,CHIC,rainy
|
52 |
+
2023/09/10,200.0,Alan Patel,case--3485,Criminal,CHIC,cloudy
|
53 |
+
2023/08/18,200.0,Lakshmi Kim,case--5520,Criminal,LOSAN,sunny
|
54 |
+
2023/02/21,250.0,Alan Patel,case--9879,Criminal,LOSA,sunny
|
55 |
+
2023/05/12,200.0,Jane Brown,case--5259,Criminal,NEWYO,rainy
|
56 |
+
2023/01/20,100.0,Oluwaseun Patel,case--8333,Criminal,BOSTO,cloudy
|
57 |
+
2023/01/23,200.0,Chen Nasser,case--2711,Civil,LOSAN,sunny
|
58 |
+
2023/03/12,100.0,Miguel Brown,case--5100,Family,LOSAN,sunny
|
59 |
+
2023/01/15,100.0,Terrance Rodriguez,case--4849,Criminal,LOSAN,rainy
|
60 |
+
2023/05/17,150.0,Jane Lee,case--8058,Criminal,NEWY,cloudy
|
61 |
+
2023/04/18,100.0,Chen Okafor,case--9076,Civil,NEWYO,sunny
|
62 |
+
2023/02/22,200.0,Lakshmi Chatterjee,case--5230,Criminal,BOST,rainy
|
63 |
+
2023/08/18,200.0,John Brown,case--7094,Criminal,LOSA,cloudy
|
64 |
+
2023/08/17,150.0,Oluwaseun Lee,case--8915,Civil,BOSTO,sunny
|
65 |
+
2023/08/18,100.0,Alan Malcolm,case--9030,Family,CHIC,sunny
|
66 |
+
2023/02/13,150.0,Chen Malcolm,case--1482,Criminal,HOUS,cloudy
|
67 |
+
2023/02/16,100.0,John Brown,case--3535,Criminal,BOST,rainy
|
68 |
+
2023/08/20,250.0,Chen Johnson,case--2029,Criminal,HOUST,sunny
|
69 |
+
2023/01/10,250.0,Alan Kim,case--1812,Civil,HOUST,sunny
|
70 |
+
2023/02/18,150.0,Alice Chatterjee,case--5295,Civil,CHICA,snowy
|
71 |
+
2023/08/25,150.0,Miguel Lee,case--6850,Criminal,LOSA,sunny
|
72 |
+
2023/05/12,150.0,Alan Malcolm,case--7973,Criminal,BOST,cloudy
|
73 |
+
2023/05/19,200.0,Chen Johnson,case--5221,Family,HOUS,snowy
|
74 |
+
2023/06/17,250.0,John Okafor,case--4117,Criminal,BOSTO,sunny
|
75 |
+
2023/03/18,100.0,Elena Patel,case--2368,Family,HOUST,rainy
|
76 |
+
2023/06/22,200.0,Lakshmi Rodriguez,case--8384,Family,NEWY,cloudy
|
77 |
+
2023/07/14,100.0,Miguel Smith,case--4476,Civil,NEWYO,cloudy
|
78 |
+
2023/03/26,100.0,Chen Brown,case--4545,Civil,HOUST,snowy
|
79 |
+
2023/06/22,250.0,Dmitri Chatterjee,case--4421,Civil,HOUS,snowy
|
80 |
+
2023/03/20,150.0,Miguel Patel,case--6559,Criminal,NEWYO,snowy
|
81 |
+
2023/07/11,250.0,Oluwaseun Kim,case--1803,Civil,BOSTO,sunny
|
82 |
+
2023/03/13,150.0,Elena Okafor,case--8622,Civil,NEWYO,cloudy
|
83 |
+
2023/05/27,200.0,Alice Lee,case--9488,Criminal,LOSAN,cloudy
|
84 |
+
2023/05/14,150.0,Alice Patel,case--4581,Civil,CHICA,sunny
|
85 |
+
2023/06/27,250.0,Terrance Malcolm,case--2388,Criminal,CHIC,sunny
|
86 |
+
2023/02/13,100.0,Terrance Ivanov,case--6529,Criminal,LOSAN,snowy
|
87 |
+
2023/01/21,150.0,Terrance Patel,case--4443,Family,BOST,sunny
|
88 |
+
2023/09/22,200.0,John Malcolm,case--8721,Civil,NEWYO,snowy
|
89 |
+
2023/02/12,250.0,Miguel Malcolm,case--3780,Family,NEWYO,cloudy
|
90 |
+
2023/04/26,250.0,Alan Kim,case--2663,Criminal,NEWY,rainy
|
91 |
+
2023/03/16,150.0,Lakshmi Ivanov,case--8702,Criminal,LOSA,snowy
|
92 |
+
2023/07/22,100.0,Jane Ivanov,case--1232,Criminal,BOSTO,rainy
|
93 |
+
2023/05/28,150.0,Nadia Okafor,case--5215,Family,HOUS,cloudy
|
94 |
+
2023/03/14,250.0,Oluwaseun Okafor,case--6631,Criminal,BOST,rainy
|
95 |
+
2023/06/17,150.0,Alan Nasser,case--1405,Civil,BOST,snowy
|
96 |
+
2023/08/13,100.0,Oluwaseun Kim,case--8816,Civil,LOSAN,cloudy
|
97 |
+
2023/07/20,150.0,Oluwaseun Brown,case--2665,Family,NEWYO,sunny
|
98 |
+
2023/05/16,100.0,Alan Patel,case--2874,Family,NEWYO,sunny
|
99 |
+
2023/07/15,100.0,Nadia Chatterjee,case--2037,Family,HOUST,rainy
|
100 |
+
2023/04/18,200.0,Dmitri Johnson,case--5402,Criminal,HOUS,snowy
|
101 |
+
2023/08/14,250.0,Chen Johnson,case--3569,Civil,BOST,sunny
|
src/data/synthetic/legal_template.csv
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
CaseDate,FullName,CaseType,CaseID,Fee,Jurisdiction
|
2 |
+
2023-01-16,Jane Okafor,Civil,CASE-6190,250,Boston
|
3 |
+
2023-08-10,Elena Malcolm,Civil,CASE-3092,100,Chicago
|
4 |
+
2023-06-14,Alan Nasser,Civil,CASE-5947,150,Boston
|
5 |
+
2023-07-17,Miguel Smith,Family,CASE-7727,250,Los Angeles
|
6 |
+
2023-07-25,John Kim,Criminal,CASE-4120,150,Boston
|
7 |
+
2023-07-14,John Brown,Civil,CASE-8850,100,Los Angeles
|
8 |
+
2023-01-19,Dmitri Nasser,Criminal,CASE-2308,100,Chicago
|
9 |
+
2023-02-26,Alan Rodriguez,Criminal,CASE-4477,100,Chicago
|
10 |
+
2023-02-10,Alice Brown,Criminal,CASE-9490,200,Chicago
|
11 |
+
2023-09-12,Nadia Smith,Family,CASE-4111,100,Los Angeles
|
12 |
+
2023-02-25,Chen Kim,Criminal,CASE-9421,150,Boston
|
13 |
+
2023-09-15,John Kim,Family,CASE-3270,200,Houston
|
14 |
+
2023-07-22,Nadia Patel,Family,CASE-1501,200,Houston
|
15 |
+
2023-01-27,Lakshmi Lee,Family,CASE-8321,150,Houston
|
16 |
+
2023-01-14,John Brown,Family,CASE-2748,100,Los Angeles
|
17 |
+
2023-07-13,Miguel Malcolm,Family,CASE-3163,100,Los Angeles
|
18 |
+
2023-02-26,Alice Smith,Civil,CASE-4296,150,Boston
|
19 |
+
2023-09-25,Terrance Patel,Criminal,CASE-2230,200,Houston
|
20 |
+
2023-02-13,Alan Ivanov,Family,CASE-9353,100,New York
|
21 |
+
2023-04-18,Alice Chatterjee,Civil,CASE-8786,100,Chicago
|
22 |
+
2023-09-11,Jane Brown,Criminal,CASE-6001,100,Los Angeles
|
23 |
+
2023-02-16,Jane Okafor,Criminal,CASE-9434,250,Boston
|
24 |
+
2023-07-22,Dmitri Chatterjee,Criminal,CASE-1042,100,Boston
|
25 |
+
2023-08-28,Miguel Smith,Family,CASE-1427,150,Los Angeles
|
26 |
+
2023-06-14,Miguel Johnson,Civil,CASE-7553,200,Chicago
|
27 |
+
2023-02-24,Chen Ivanov,Civil,CASE-2242,250,Los Angeles
|
28 |
+
2023-06-23,Terrance Rodriguez,Criminal,CASE-6940,250,Houston
|
29 |
+
2023-01-10,Elena Johnson,Civil,CASE-4064,150,Houston
|
30 |
+
2023-01-15,Chen Patel,Civil,CASE-3129,100,New York
|
31 |
+
2023-08-16,Oluwaseun Malcolm,Civil,CASE-2758,150,Boston
|
32 |
+
2023-02-24,Lakshmi Ivanov,Criminal,CASE-9562,250,Boston
|
33 |
+
2023-05-15,Terrance Okafor,Criminal,CASE-2292,250,Boston
|
34 |
+
2023-06-26,Jane Patel,Criminal,CASE-7889,250,Los Angeles
|
35 |
+
2023-02-14,John Rodriguez,Family,CASE-5178,150,Houston
|
36 |
+
2023-05-15,Terrance Patel,Civil,CASE-5004,150,Houston
|
37 |
+
2023-03-19,Alice Johnson,Family,CASE-2883,200,New York
|
38 |
+
2023-02-12,Alan Rodriguez,Family,CASE-4416,200,Boston
|
39 |
+
2023-07-25,Chen Malcolm,Civil,CASE-9332,200,Houston
|
40 |
+
2023-09-15,Miguel Chatterjee,Civil,CASE-7699,250,Boston
|
41 |
+
2023-03-13,Nadia Lee,Civil,CASE-7258,100,New York
|
42 |
+
2023-05-27,Nadia Brown,Civil,CASE-7490,200,Houston
|
43 |
+
2023-02-22,Alice Johnson,Civil,CASE-8231,100,Chicago
|
44 |
+
2023-03-18,Nadia Malcolm,Criminal,CASE-2720,100,New York
|
45 |
+
2023-06-11,Nadia Brown,Criminal,CASE-4277,100,Boston
|
46 |
+
2023-02-22,Oluwaseun Okafor,Criminal,CASE-9738,100,New York
|
47 |
+
2023-08-19,Jane Patel,Civil,CASE-2452,250,Boston
|
48 |
+
2023-09-27,Alan Lee,Family,CASE-1899,100,New York
|
49 |
+
2023-04-21,Dmitri Malcolm,Family,CASE-8404,150,Los Angeles
|
50 |
+
2023-03-10,Alice Chatterjee,Family,CASE-4240,100,Los Angeles
|
51 |
+
2023-05-13,Elena Kim,Family,CASE-6153,250,Chicago
|
52 |
+
2023-09-10,Alan Patel,Criminal,CASE-3485,200,Chicago
|
53 |
+
2023-08-18,Lakshmi Kim,Criminal,CASE-5520,200,Los Angeles
|
54 |
+
2023-02-21,Alan Patel,Criminal,CASE-9879,250,Los Angeles
|
55 |
+
2023-05-12,Jane Brown,Criminal,CASE-5259,200,New York
|
56 |
+
2023-01-20,Oluwaseun Patel,Criminal,CASE-8333,100,Boston
|
57 |
+
2023-01-23,Chen Nasser,Civil,CASE-2711,200,Los Angeles
|
58 |
+
2023-03-12,Miguel Brown,Family,CASE-5100,100,Los Angeles
|
59 |
+
2023-01-15,Terrance Rodriguez,Criminal,CASE-4849,100,Los Angeles
|
60 |
+
2023-05-17,Jane Lee,Criminal,CASE-8058,150,New York
|
61 |
+
2023-04-18,Chen Okafor,Civil,CASE-9076,100,New York
|
62 |
+
2023-02-22,Lakshmi Chatterjee,Criminal,CASE-5230,200,Boston
|
63 |
+
2023-08-18,John Brown,Criminal,CASE-7094,200,Los Angeles
|
64 |
+
2023-08-17,Oluwaseun Lee,Civil,CASE-8915,150,Boston
|
65 |
+
2023-08-18,Alan Malcolm,Family,CASE-9030,100,Chicago
|
66 |
+
2023-02-13,Chen Malcolm,Criminal,CASE-1482,150,Houston
|
67 |
+
2023-02-16,John Brown,Criminal,CASE-3535,100,Boston
|
68 |
+
2023-08-20,Chen Johnson,Criminal,CASE-2029,250,Houston
|
69 |
+
2023-01-10,Alan Kim,Civil,CASE-1812,250,Houston
|
70 |
+
2023-02-18,Alice Chatterjee,Civil,CASE-5295,150,Chicago
|
71 |
+
2023-08-25,Miguel Lee,Criminal,CASE-6850,150,Los Angeles
|
72 |
+
2023-05-12,Alan Malcolm,Criminal,CASE-7973,150,Boston
|
73 |
+
2023-05-19,Chen Johnson,Family,CASE-5221,200,Houston
|
74 |
+
2023-06-17,John Okafor,Criminal,CASE-4117,250,Boston
|
75 |
+
2023-03-18,Elena Patel,Family,CASE-2368,100,Houston
|
76 |
+
2023-06-22,Lakshmi Rodriguez,Family,CASE-8384,200,New York
|
77 |
+
2023-07-14,Miguel Smith,Civil,CASE-4476,100,New York
|
78 |
+
2023-03-26,Chen Brown,Civil,CASE-4545,100,Houston
|
79 |
+
2023-06-22,Dmitri Chatterjee,Civil,CASE-4421,250,Houston
|
80 |
+
2023-03-20,Miguel Patel,Criminal,CASE-6559,150,New York
|
81 |
+
2023-07-11,Oluwaseun Kim,Civil,CASE-1803,250,Boston
|
82 |
+
2023-03-13,Elena Okafor,Civil,CASE-8622,150,New York
|
83 |
+
2023-05-27,Alice Lee,Criminal,CASE-9488,200,Los Angeles
|
84 |
+
2023-05-14,Alice Patel,Civil,CASE-4581,150,Chicago
|
85 |
+
2023-06-27,Terrance Malcolm,Criminal,CASE-2388,250,Chicago
|
86 |
+
2023-02-13,Terrance Ivanov,Criminal,CASE-6529,100,Los Angeles
|
87 |
+
2023-01-21,Terrance Patel,Family,CASE-4443,150,Boston
|
88 |
+
2023-09-22,John Malcolm,Civil,CASE-8721,200,New York
|
89 |
+
2023-02-12,Miguel Malcolm,Family,CASE-3780,250,New York
|
90 |
+
2023-04-26,Alan Kim,Criminal,CASE-2663,250,New York
|
91 |
+
2023-03-16,Lakshmi Ivanov,Criminal,CASE-8702,150,Los Angeles
|
92 |
+
2023-07-22,Jane Ivanov,Criminal,CASE-1232,100,Boston
|
93 |
+
2023-05-28,Nadia Okafor,Family,CASE-5215,150,Houston
|
94 |
+
2023-03-14,Oluwaseun Okafor,Criminal,CASE-6631,250,Boston
|
95 |
+
2023-06-17,Alan Nasser,Civil,CASE-1405,150,Boston
|
96 |
+
2023-08-13,Oluwaseun Kim,Civil,CASE-8816,100,Los Angeles
|
97 |
+
2023-07-20,Oluwaseun Brown,Family,CASE-2665,150,New York
|
98 |
+
2023-05-16,Alan Patel,Family,CASE-2874,100,New York
|
99 |
+
2023-07-15,Nadia Chatterjee,Family,CASE-2037,100,Houston
|
100 |
+
2023-04-18,Dmitri Johnson,Criminal,CASE-5402,200,Houston
|
101 |
+
2023-08-14,Chen Johnson,Civil,CASE-3569,250,Boston
|
src/notebooks/table_mapping
DELETED
@@ -1,9 +0,0 @@
|
|
1 |
-
,source_column_name,target_column_name,value_transformations,explanation
|
2 |
-
0,case_date,CaseDate,NO_TRANSFORM,The 'case_date' column in the source table maps directly to the 'CaseDate' column in the target table with no transformation needed.
|
3 |
-
1,lastname,FullName,"Concatenate 'lastname' and 'firstname' from source table, with a space in between, and reverse the order.",The 'lastname' and 'firstname' columns in the source table are combined and reversed to form the 'FullName' column in the target table.
|
4 |
-
2,firstname,FullName,"Concatenate 'lastname' and 'firstname' from source table, with a space in between, and reverse the order.",The 'lastname' and 'firstname' columns in the source table are combined and reversed to form the 'FullName' column in the target table.
|
5 |
-
3,case_type,CaseType,"Correct spelling errors in source table ('Familly' to 'Family', 'Criminl' to 'Criminal', 'Civl' to 'Civil').","The 'case_type' column in the source table maps to the 'CaseType' column in the target table, with spelling corrections needed."
|
6 |
-
4,case_id,CaseID,Replace 'CR-' prefix in source table with 'CASE-' prefix.,"The 'case_id' column in the source table maps to the 'CaseID' column in the target table, with a prefix change needed."
|
7 |
-
5,court_fee,Fee,NO_TRANSFORM,The 'court_fee' column in the source table maps directly to the 'Fee' column in the target table with no transformation needed.
|
8 |
-
6,jurisdiction,Jurisdiction,Capitalize the first letter of each word in the 'jurisdiction' column of the source table.,"The 'jurisdiction' column in the source table maps to the 'Jurisdiction' column in the target table, with capitalization needed."
|
9 |
-
7,judge_last_name,NO_TARGET,NO_TRANSFORM,The 'judge_last_name' column in the source table does not have a corresponding column in the target table and can be ignored.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/prompt.py
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
DATA_SCIENTIST_PROMPT_STR = '''
|
2 |
+
You are a Data Scientist, who specializes in generating schema mappings for use by Software Engineers in ETL pipelines.
|
3 |
+
|
4 |
+
Head of `source_csv`:
|
5 |
+
|
6 |
+
{source_1_csv_str}
|
7 |
+
|
8 |
+
Head of `target_csv`:
|
9 |
+
|
10 |
+
{target_csv_str}
|
11 |
+
|
12 |
+
Your job is to generate a thorough, precise summary of how `source_csv` should be transformed to adhere exactly to the `target_csv` schema.
|
13 |
+
|
14 |
+
For each column in the `source_csv`, you must communicate which column in the `target_csv` it maps to, and how the values in the `source_csv` column should be transformed to match those in the `target_csv`.
|
15 |
+
You can assume the rows are aligned: that is, the first row in `source_csv` corresponds to the first row in `target_csv`, and so on.
|
16 |
+
|
17 |
+
Remember:
|
18 |
+
1. Which column in `target_csv` it maps to. You should consider the semantic meaning of the columns, not just the character similarity.
|
19 |
+
|
20 |
+
Example mappings:
|
21 |
+
- 'MunICipality' in `source_csv` should map to 'City' in `target_csv`.
|
22 |
+
- 'fullname' in `source_csv` should map to both 'FirstName' and 'LastName' in `target_csv`. You must explain this transformation, as well, including the target sequencing of first and last name.
|
23 |
+
|
24 |
+
Example transformations:
|
25 |
+
- If date in `source_csv` is `2020-01-01` and date in `target_csv` is `01/01/2020`, explain exactly how this should be transformed and the reasoning behind it.
|
26 |
+
- If city in `source_csv` is `New York` and city in `target_csv` is `NEW YORK` or `NYC`, explain exactly how this should be transformed and the reasoning behind it.
|
27 |
+
|
28 |
+
Lastly, point out any other oddities, such as duplicate columns, erroneous columns, etc.
|
29 |
+
|
30 |
+
{format_instructions}
|
31 |
+
|
32 |
+
Remember:
|
33 |
+
- Be concise: you are speaking to engineers, not customers.
|
34 |
+
- Be precise: all of these values are case sensitive. Consider casing for city names, exact prefixes for identifiers, ordering of people's names, etc.
|
35 |
+
- DO NOT include commas, quotes, or any other characters that might interfere with JSON serialization or CSV generation
|
36 |
+
|
37 |
+
Your response:
|
38 |
+
'''
|
39 |
+
|
40 |
+
|
41 |
+
SPEC_WRITER_PROMPT_STR = '''
|
42 |
+
You are an expert product manager and technical writer for a software company, who generates clean, concise, precise specification documents for your employees.
|
43 |
+
Your job is to write a plaintext spec for a python script for a software engineer to develop a component within an ETL pipeline.
|
44 |
+
|
45 |
+
This document must include 100% of the information your employee needs to write a successful script to transform source_df to target_df.
|
46 |
+
However, DO NOT include the original table_mapping. Your job is to translate everything into natural language.
|
47 |
+
|
48 |
+
Here is a stringified pydantic object that describes the mapping and the transformation steps:
|
49 |
+
|
50 |
+
{table_mapping}
|
51 |
+
|
52 |
+
You must translate this into clean, concise, and complete instructions for your employee.
|
53 |
+
|
54 |
+
This document should be formatted like a technical document in plaintext. Do not include code or data.
|
55 |
+
|
56 |
+
This document must include:
|
57 |
+
- Overview
|
58 |
+
- Input (source_df), Output (target_df)
|
59 |
+
- Exact column mapping
|
60 |
+
- Exact transformation steps for each column
|
61 |
+
- Precise instructions for what this script should do
|
62 |
+
- Script input: Pandas Dataframe named `source_df`.
|
63 |
+
- Script output: Pandas Dataframe named `target_df`.
|
64 |
+
- Do not modify the source_df. Create a new dataframe named target_df.
|
65 |
+
- This script should never include the source data. It should only include the transformations required to create the target_df.
|
66 |
+
- Return the target_df.
|
67 |
+
|
68 |
+
You will never see this employee. They cannot contact you. You will never see their code. You must include 100% of the information they need to write a successful script.
|
69 |
+
Remember:
|
70 |
+
- Clean: No extra information, no formatting aside from plaintext
|
71 |
+
- Concise: Your employees benefit from brevity
|
72 |
+
- Precise: your words must be unambiguous, exact, and fully represent a perfect translation of the table_mapping object.
|
73 |
+
|
74 |
+
Your response:
|
75 |
+
'''
|
76 |
+
|
77 |
+
|
78 |
+
ENGINEER_PROMPT_STR = '''
|
79 |
+
You are a Senior Software Engineer, who specializes in writing Python code for ETL pipelines.
|
80 |
+
Your Product Manager has written a spec for a new transformation script. You must follow this document exactly, write python code that implements the spec, validate that code, and then return it.
|
81 |
+
Your output should only be python code in Markdown format, eg:
|
82 |
+
```python
|
83 |
+
....
|
84 |
+
```
|
85 |
+
Do not return any additional text / explanation. This code will be executed by a robot without human intervention.
|
86 |
+
|
87 |
+
Here is the technical specification for your code:
|
88 |
+
|
89 |
+
{spec_str}
|
90 |
+
|
91 |
+
Remember: return only clean python code in markdown format. The python interpreter running this code will already have `source_df` as a local variable.
|
92 |
+
|
93 |
+
You must return `target_df` at the end.
|
94 |
+
'''
|
src/types.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pydantic import BaseModel, Field
|
2 |
+
|
3 |
+
class TableMappingEntry(BaseModel):
|
4 |
+
'''A single row in a table mapping. Describes how a single column in a source table maps to a single column in a target table, including any necessary transformations, and their explanations.'''
|
5 |
+
source_column_name: str = Field(..., description="Name of the column in the source table.")
|
6 |
+
target_column_name: str = Field(..., description="Name of the column in the target table, to which the source column maps.")
|
7 |
+
value_transformations: str = Field(..., description="Transformations needed to make the source values match the target values. If unnecessary, write 'NO_TRANSFORM'.")
|
8 |
+
explanation: str = Field(..., description="One-sentence explanation of this row (source-target mapping/transformation). Include any information that might be relevant to a software engineer building an ETL pipeline with this document.")
|
9 |
+
|
10 |
+
class TableMapping(BaseModel):
|
11 |
+
'''A list of table mappings that collectively describe how a source table should be transformed to match the schema of a target table.'''
|
12 |
+
table_mappings: list[TableMappingEntry] = Field(..., description="A list of table mappings.")
|
src/vars.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
NUM_ROWS_TO_RETURN = 5
|