# Spaces: Running on L4
import json | |
def extract_leaves(item, path=None, leaves=None):
    """
    Collect the leaf values of a nested dict/list structure.

    Dictionaries contribute their key to the path of everything beneath
    them; lists are traversed without adding a path component, so all
    elements of a list share the path of the list itself. Leaves equal to
    the empty string are skipped.

    Args:
        item: The structure (dict, list, or leaf value) to walk.
        path: Keys already traversed to reach ``item``; defaults to ``[]``.
        leaves: Accumulator list; a new one is created when omitted.

    Returns:
        The accumulator: a list of ``(path, value)`` tuples, in traversal
        order. When ``leaves`` is supplied, that same list is returned.
    """
    leaves = [] if leaves is None else leaves
    path = [] if path is None else path

    if isinstance(item, dict):
        for name, child in item.items():
            # A fresh list per key keeps sibling paths independent.
            extract_leaves(child, path + [name], leaves)
        return leaves

    if isinstance(item, list):
        for element in item:
            extract_leaves(element, path, leaves)
        return leaves

    # Anything else is a leaf; empty strings are treated as "no value".
    if item != '':
        leaves.append((path, item))
    return leaves
def split_document(document, window_size, overlap, tokenizer): | |
""" | |
Splits a document into chunks of a specified window size with an overlap. | |
""" | |
tokens = tokenizer.tokenize(document) | |
print(f"\tLength of document: {len(tokens)} tokens") | |
chunks = [] | |
if len(tokens) > window_size: | |
for i in range(0, len(tokens), window_size-overlap): | |
print(f"\t{i} to {i + len(tokens[i:i + window_size])}") | |
chunk = tokenizer.convert_tokens_to_string(tokens[i:i + window_size]) | |
chunks.append(chunk) | |
if i + len(tokens[i:i + window_size]) >= len(tokens): | |
break | |
else: | |
chunks.append(document) | |
print(f"\tSplit into {len(chunks)} chunks") | |
return chunks | |
def handle_broken_output(pred, prev):
    """
    Fall back to the previous prediction when the new one is broken or empty.

    ``pred`` is parsed as JSON. It is considered empty when every value of
    the parsed object is ``""`` or ``[]`` (an object with no keys counts as
    empty too). It is considered broken when parsing fails or when the
    parsed JSON is not an object.

    Args:
        pred: The new prediction, expected to be a JSON object string.
        prev: The value to return when ``pred`` is broken or empty.

    Returns:
        ``pred`` unchanged if it is a non-empty JSON object, else ``prev``.
    """
    try:
        parsed = json.loads(pred)
        is_empty = all(value in ("", []) for value in parsed.values())
    except Exception:
        # Malformed JSON (ValueError), non-string input (TypeError) or a
        # non-object payload without .values() (AttributeError) all mean
        # "broken". Exception rather than a bare except keeps
        # KeyboardInterrupt/SystemExit propagating.
        return prev
    return prev if is_empty else pred
def clean_json_text(text):
    """
    Clean JSON text by trimming whitespace and un-escaping ``#`` and ``&``.

    Leading and trailing whitespace is stripped, then every backslash-escaped
    ``\\#`` and ``\\&`` is replaced by the plain character.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text.
    """
    text = text.strip()
    # The backslashes are doubled on purpose: "\#" and "\&" are invalid
    # escape sequences that trigger a warning on modern Python, while
    # "\\#" / "\\&" denote the same two-character strings explicitly.
    return text.replace("\\#", "#").replace("\\&", "&")
def sync_empty_fields(dict1, dict2):
    """
    Align the empty fields of ``dict1`` with those of ``dict2``.

    ``dict1`` is modified in place, using ``dict2`` as the reference:

    * a field that is empty in ``dict2`` is added to ``dict1`` if missing;
    * a field that is non-empty in ``dict2`` but empty in ``dict1`` is
      removed from ``dict1``;
    * nested dictionaries are synchronized recursively;
    * fields of ``dict1`` that do not appear in ``dict2`` are removed.

    "Empty" means one of ``None``, ``""``, ``[]`` or ``{}``.

    Args:
        dict1 (dict): The dictionary to be modified.
        dict2 (dict): The reference dictionary.

    Returns:
        dict: ``dict1`` after synchronization. If either argument is not a
        dictionary, ``dict1`` is returned untouched.
    """
    if not (isinstance(dict1, dict) and isinstance(dict2, dict)):
        return dict1

    blank_values = (None, "", [], {})

    for field, reference in dict2.items():
        if isinstance(reference, dict):
            # Recurse; a field missing from dict1 starts as an empty dict.
            dict1[field] = sync_empty_fields(dict1.get(field, {}), reference)
            continue

        if reference in blank_values:
            # Empty in the reference: make sure dict1 has the field too,
            # without overwriting a value it already holds.
            dict1.setdefault(field, reference)
            continue

        # Non-empty in the reference: an empty counterpart in dict1 goes.
        if field in dict1 and dict1[field] in blank_values:
            del dict1[field]

    # Finally drop every field that the reference does not know about.
    for stale in [name for name in dict1 if name not in dict2]:
        del dict1[stale]

    return dict1