# ark-instruct-line-item / data_prep.py
# tosi-n7's picture
# Upload folder using huggingface_hub
# d8ffdc4
# %%
# Merge two tab-separated exports into a single JSON "records" file:
# load both inputs, keep the three shared columns, rename two of them,
# drop rows with missing values and write the result.
import pandas as pd

# Inputs are read with a tab separator even though they are named *.csv.
SOURCE_CSVS = (
    '/home/tosi-n/ark/data/jack_line_item_ner_task_v2.csv',
    '/home/tosi-n/ark/data/jack_line_item_ner_task.csv',
)
OUTPUT_JSON = '/home/tosi-n/ark/data/line_item_and_alm_data_v1.json'

# Columns kept from each input, and the names two of them get in the output.
KEEP_COLUMNS = ['context', 'instruction', 'response']
RENAMED_COLUMNS = {'context': 'input', 'response': 'output'}

# `display` is only injected by IPython/Jupyter. Fall back to `print` so the
# file also runs top to bottom as a plain script instead of raising NameError.
try:
    display
except NameError:
    display = print

df_i, df_ii = (pd.read_csv(path, sep='\t') for path in SOURCE_CSVS)
display(df_i.head())
display(df_ii.head())
# %%
df_i = df_i[KEEP_COLUMNS]
df_ii = df_ii[KEEP_COLUMNS]
# ignore_index renumbers the rows so labels from the two frames do not
# collide; the exported records are unaffected because orient='records'
# does not write the index.
df = pd.concat([df_i, df_ii], ignore_index=True).rename(columns=RENAMED_COLUMNS)
display(df.head())
# %%
# check for nan values (per-column count of missing entries)
# Shown via display() so the counts are visible in script mode as well as
# in an interactive cell.
display(df.isna().sum())
# %%
# drop nan values: any row with at least one missing field is removed
df = df.dropna()
# %%
# Written as a JSON array with one object per remaining row.
df.to_json(OUTPUT_JSON, orient='records')
# %%