import numpy as np
import pandas as pd
from pandas import json_normalize

from iblou_merger import ent_merge


def word(df):
    """Flatten the ``words`` records of *df* into a one-column DataFrame.

    Args:
        df: Input accepted by ``pandas.json_normalize`` that exposes a
            ``words`` record path (presumably a dict or list of dicts
            despite the parameter name -- confirm against the caller).

    Returns:
        DataFrame produced by ``json_normalize``; a column labelled ``0``
        (which appears when the records are plain scalars) is renamed to
        ``words``.
    """
    word_data = json_normalize(df, record_path=['words'])
    word_data.rename(columns={0: 'words'}, inplace=True)
    return word_data


def ent(df):
    """Flatten the ``entities`` records of *df*, carrying ``text`` as meta.

    Args:
        df: Input accepted by ``pandas.json_normalize`` that exposes an
            ``entities`` record path and a ``text`` field.

    Returns:
        DataFrame produced by ``json_normalize``; a column labelled ``0``
        is renamed to ``entities`` and the ``text`` meta column is kept.
    """
    ent_data = json_normalize(df, record_path=['entities'], meta="text")
    ent_data.rename(columns={0: 'entities'}, inplace=True)
    return ent_data


def merge(word_data, ent_data):
    """Join the word and entity frames row-by-row on their indexes.

    Uses ``pd.merge`` with its default join type, so only index labels
    present in both frames survive.
    """
    return pd.merge(word_data, ent_data, left_index=True, right_index=True)


def process_df(df):
    """Build a words/entity DataFrame with cleaned helper columns.

    Steps:
        1. Flatten words and entities from *df* and join them on index.
        2. Pass the joined frame through ``ent_merge`` (defined in
           ``iblou_merger``; its exact output shape is not visible here,
           it is loaded as two columns ``words`` and ``entity``).
        3. Add ``clean_words``: cells containing a run of characters that
           are neither word characters nor whitespace become NaN.
        4. Add ``clean_entity``: empty-string cells become NaN.
        5. Drop rows in which no column holds a truthy value.
        6. Renumber the index from 0.

    Args:
        df: Same input as accepted by :func:`word` and :func:`ent`.

    Returns:
        DataFrame with columns ``words``, ``entity``, ``clean_words`` and
        ``clean_entity`` and a fresh ``RangeIndex``.
    """
    new_df = merge(word(df), ent(df))
    new_df = pd.DataFrame(ent_merge(new_df), columns=["words", "entity"])

    # ``np.nan`` instead of ``np.NAN``: the upper-case alias was removed
    # in NumPy 2.0 and raises AttributeError there.
    new_df["clean_words"] = new_df['words'].replace(
        r'[^\w\s]+', np.nan, regex=True
    )
    # ``^(?![\s\S])`` only matches when nothing follows the start of the
    # string, i.e. the empty string.
    new_df["clean_entity"] = new_df['entity'].replace(
        r'^(?![\s\S])', np.nan, regex=True
    )

    new_df = new_df[new_df.any(axis=1)]
    # ``reset_index`` is not in-place by default; the original discarded
    # its result, so the filtered frame kept its gapped index.
    new_df = new_df.reset_index(drop=True)
    return new_df