File size: 938 Bytes
06924e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
from pandas import json_normalize
import pandas as pd
import numpy as np
from iblou_merger import ent_merge


def word(df):
    """Flatten the ``words`` lists of the input records into a one-column frame.

    Args:
        df: JSON-like data accepted by ``json_normalize`` (a dict or a list
            of dicts), each record holding a list under the ``"words"`` key.
            NOTE(review): despite the name this is presumably not a
            DataFrame -- confirm against the caller.

    Returns:
        A DataFrame with one row per word. When the lists hold plain
        scalars, ``json_normalize`` labels the column ``0``; that label is
        renamed to ``"words"``.
    """
    column_names = {0: 'words'}
    flattened = json_normalize(df, record_path=['words'])
    return flattened.rename(columns=column_names)


def ent(df):
    """Flatten the ``entities`` lists of the input records, keeping ``text``.

    Args:
        df: JSON-like data accepted by ``json_normalize`` (a dict or a list
            of dicts), each record holding a list under ``"entities"`` and a
            ``"text"`` field.
            NOTE(review): despite the name this is presumably not a
            DataFrame -- confirm against the caller.

    Returns:
        A DataFrame with one row per entity label. The default ``0`` column
        is renamed to ``"entities"``, and the record's ``"text"`` value is
        repeated on every row that came from that record.
    """
    column_names = {0: 'entities'}
    flattened = json_normalize(df, record_path=['entities'], meta="text")
    return flattened.rename(columns=column_names)


def merge(word_data, ent_data):
    """Join the word and entity frames row-by-row on their indexes.

    Args:
        word_data: Left DataFrame.
        ent_data: Right DataFrame.

    Returns:
        The inner join of both frames on index (pandas' default ``how``),
        so only index labels present in both inputs are kept.
    """
    joined = word_data.merge(ent_data, left_index=True, right_index=True)
    return joined


def process_df(df):
    """Build a cleaned word/entity table from raw annotation records.

    The records are flattened with ``word`` and ``ent``, joined on index,
    passed through ``ent_merge``, and then given two cleaned columns.

    Args:
        df: JSON-like records carrying ``"words"``, ``"entities"`` and
            ``"text"`` fields, as consumed by ``word`` and ``ent``.

    Returns:
        A DataFrame with columns ``words``, ``entity``, ``clean_words`` and
        ``clean_entity``. Rows with no truthy value in any column are
        dropped, and the index is renumbered from 0.
    """
    new_df = merge(word(df), ent(df))
    # NOTE(review): ent_merge is assumed to yield (word, entity) pairs, since
    # its output is loaded into exactly these two columns -- confirm.
    new_df = pd.DataFrame(ent_merge(new_df), columns=["words", "entity"])
    # Use ``np.nan``: the ``np.NAN`` alias was removed in NumPy 2.0.
    # Any word in which a run of non-word, non-space characters is found
    # becomes NaN in the cleaned column.
    new_df["clean_words"] = new_df['words'].replace(r'[^\w\s]+', np.nan, regex=True)
    # ``^(?![\s\S])`` only matches the empty string, so blank entity labels
    # become NaN.
    new_df["clean_entity"] = new_df['entity'].replace(r'^(?![\s\S])', np.nan, regex=True)
    # Keep only rows that still hold at least one truthy value.
    new_df = new_df[new_df.any(axis=1)]
    # reset_index returns a new frame; the original discarded it, leaving
    # the gapped index produced by the row filter above.
    return new_df.reset_index(drop=True)