# %%
# Load the two tab-separated source files and preview the first rows of each.
# NOTE: `display` is expected to be supplied by the interactive (IPython) kernel.
import pandas as pd

df_i = pd.read_csv('/home/tosi-n/ark/data/jack_line_item_ner_task_v2.csv', sep='\t')
df_ii = pd.read_csv('/home/tosi-n/ark/data/jack_line_item_ner_task.csv', sep='\t')
display(df_i.head())
display(df_ii.head())

# %%
# Keep only the three shared columns, stack both frames, and rename
# 'context' -> 'input' and 'response' -> 'output' ('instruction' is unchanged).
kept_columns = ['context', 'instruction', 'response']
df_i = df_i.loc[:, kept_columns]
df_ii = df_ii.loc[:, kept_columns]

df = pd.concat([df_i, df_ii]).rename(
    columns={'context': 'input', 'response': 'output'}
)
display(df.head())

# %%
# Count missing values per column (shown as the cell's result when run interactively).
df.isna().sum()

# %%
# Discard every row that contains at least one missing value.
df = df.dropna()

# %%
# Write the cleaned frame as a JSON array with one object per row.
df.to_json('/home/tosi-n/ark/data/line_item_and_alm_data_v1.json', orient='records')

# %%