"""Convert an instruction-style NER dataset (conversation format) into
tokenized texts with token-level entity spans."""
import ast
import json
import re

from tqdm import tqdm

path = 'train.json'

with open(path, 'r') as f:
    data = json.load(f)

def tokenize_text(text):
    """Split text into word tokens (keeping hyphen/underscore compounds) and single non-space symbols."""
    return re.findall(r'\w+(?:[-_]\w+)*|\S', text)
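
# Minimal sanity check of the tokenizer (the sample string is illustrative,
# not taken from train.json): hyphenated compounds stay whole, punctuation
# splits into single-character tokens.
assert tokenize_text("state-of-the-art NER (e.g. GLiNER)") == \
    ['state-of-the-art', 'NER', '(', 'e', '.', 'g', '.', 'GLiNER', ')']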

def extract_entity_spans(entry):
    """Parse one conversation entry into (entity_spans, tokenized_text).

    Each span is a (start_token, end_token, entity_type) triple with
    inclusive token indices into the tokenized source text.
    """
    len_start = len("What describes ")
    len_end = len(" in the text?")
    tokenized_text = []  # initialized up front so entries without a 'Text: ' turn can't raise NameError
    entity_types = []
    entity_texts = []

    for c in entry['conversations']:
        if c['from'] == 'human' and c['value'].startswith('Text: '):
            # The source text to annotate.
            text = c['value'][len('Text: '):]
            tokenized_text = tokenize_text(text)

        if c['from'] == 'human' and c['value'].startswith('What describes '):
            # Question of the form "What describes <type> in the text?"
            c_type = c['value'][len_start:-len_end]
            c_type = c_type.replace(' ', '_')
            entity_types.append(c_type)

        elif c['from'] == 'gpt' and c['value'].startswith('['):
            if c['value'] == '[]':
                # No entities of this type: drop the type appended for this question.
                entity_types = entity_types[:-1]
                continue

            # The answer is a Python-style list literal of entity strings.
            texts_ents = ast.literal_eval(c['value'])
            entity_texts.extend(texts_ents)
            # The first entity already owns the appended type; repeat it for the rest.
            num_repeat = len(texts_ents) - 1
            entity_types.extend([entity_types[-1]] * num_repeat)

    # Locate every (case-insensitive) occurrence of each entity in the token sequence.
    entity_spans = []
    for j, entity_text in enumerate(entity_texts):
        entity_tokens = tokenize_text(entity_text)
        for i in range(len(tokenized_text) - len(entity_tokens) + 1):
            window = " ".join(tokenized_text[i:i + len(entity_tokens)]).lower()
            if window == " ".join(entity_tokens).lower():
                entity_spans.append((i, i + len(entity_tokens) - 1, entity_types[j]))

    return entity_spans, tokenized_text
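
# A minimal synthetic entry showing the conversation format this parser
# assumes (the example data is illustrative; real entries come from train.json):
_example = {
    'conversations': [
        {'from': 'human', 'value': 'Text: Barack Obama visited Paris.'},
        {'from': 'human', 'value': 'What describes person in the text?'},
        {'from': 'gpt', 'value': "['Barack Obama']"},
        {'from': 'human', 'value': 'What describes location in the text?'},
        {'from': 'gpt', 'value': "['Paris']"},
    ]
}
_spans, _tokens = extract_entity_spans(_example)
assert _tokens == ['Barack', 'Obama', 'visited', 'Paris', '.']
assert _spans == [(0, 1, 'person'), (3, 3, 'location')]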

# Usage on a single entry:
entry = data[17818]  # an arbitrary entry, for inspection
entity_spans, tokenized_text = extract_entity_spans(entry)
print("Entity Spans:", entity_spans)
# print("Tokenized Text:", tokenized_text)

# Convert every entry into {"tokenized_text": tokenized_text, "ner": entity_spans}.

all_data = []

for entry in tqdm(data):
    entity_spans, tokenized_text = extract_entity_spans(entry)
    all_data.append({"tokenized_text": tokenized_text, "ner": entity_spans})


with open('train_instruct.json', 'w') as f:
    json.dump(all_data, f)
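
# Optional round-trip check: JSON has no tuple type, so the span triples
# come back as [start, end, type] lists after reloading.
with open('train_instruct.json', 'r') as f:
    reloaded = json.load(f)
assert len(reloaded) == len(all_data)
assert all(isinstance(span, list) for rec in reloaded[:10] for span in rec['ner'])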