NER_Tagger / normalizer.py
kandysh
first upload
06924e0
raw
history blame
No virus
938 Bytes
from pandas import json_normalize
import pandas as pd
import numpy as np
from iblou_merger import ent_merge
def word(df):
    """Flatten the ``words`` list of each record into a single-column frame.

    ``df`` is handed directly to ``json_normalize`` with
    ``record_path=['words']``; any resulting column labelled ``0`` is then
    relabelled ``'words'``.
    """
    flattened = json_normalize(df, record_path=['words'])
    return flattened.rename(columns={0: 'words'})
def ent(df):
    """Flatten the ``entities`` list of each record, carrying ``text`` along.

    ``df`` is handed to ``json_normalize`` with ``record_path=['entities']``
    and ``meta="text"``, so every flattened row also holds the ``text`` value
    of the record it came from. Any column labelled ``0`` is relabelled
    ``'entities'``.
    """
    column_names = {0: 'entities'}
    flattened = json_normalize(df, record_path=['entities'], meta="text")
    return flattened.rename(columns=column_names)
def merge(word_data, ent_data):
    """Join the two frames on their row index.

    Uses an inner join (the ``pd.merge`` default), so only index labels
    present in both ``word_data`` and ``ent_data`` appear in the result.
    """
    return pd.merge(
        left=word_data,
        right=ent_data,
        how="inner",
        left_index=True,
        right_index=True,
    )
def process_df(df):
    """Build a word/entity frame from ``df`` and add cleaned columns.

    Steps:
      1. Flatten ``df`` with ``word`` and ``ent`` and join the results by
         row index via ``merge``.
      2. Pass the joined frame through ``ent_merge`` and wrap its output in a
         two-column frame (``words``, ``entity``).
      3. Add ``clean_words`` (NaN where the word contains any character that
         is neither a word character nor whitespace) and ``clean_entity``
         (NaN where the entity is the empty string).
      4. Keep only rows holding at least one truthy value and renumber the
         index from zero.

    Returns:
        The resulting ``pandas.DataFrame`` with a fresh 0..n-1 index.
    """
    combined = merge(word(df), ent(df))
    # NOTE(review): ent_merge is assumed to yield (word, entity) pairs, as the
    # two column labels suggest — confirm against iblou_merger.
    new_df = pd.DataFrame(ent_merge(combined), columns=["words", "entity"])

    # With regex=True and a non-string replacement, a cell that matches the
    # pattern is replaced wholesale by the value (here NaN).
    # `np.nan` is used instead of the `np.NAN` alias, which NumPy 2.0 removed.
    new_df["clean_words"] = new_df['words'].replace(r'[^\w\s]+', np.nan, regex=True)
    # `^(?![\s\S])` only matches when no character follows the start of the
    # string, i.e. the empty string.
    new_df["clean_entity"] = new_df['entity'].replace(r'^(?![\s\S])', np.nan, regex=True)

    # Drop rows in which every column is falsy/NaN.
    new_df = new_df[new_df.any(axis=1)]

    # reset_index returns a new frame; the original discarded that result and
    # so returned the filtered frame with its gappy index intact.
    return new_df.reset_index(drop=True)