Spaces:
Running
on
L4
Running
on
L4
File size: 3,211 Bytes
8103be7 2283240 8103be7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
import json
def extract_leaves(item, path=None, leaves=None):
    """
    Collect every non-empty leaf value of a nested dict/list structure.

    Dictionaries contribute their key to the path of everything below them.
    Lists are traversed but do not add anything to the path, so all elements
    of a list share the path of the list itself. Leaves equal to the empty
    string are skipped.

    Args:
        item: The structure (dict, list, or scalar leaf) to walk.
        path: Keys leading to ``item``; a fresh empty list when omitted.
        leaves: Accumulator that results are appended to; a fresh list when
            omitted.

    Returns:
        list: The accumulator, holding ``(path, value)`` tuples in
        traversal order.
    """
    leaves = [] if leaves is None else leaves
    path = [] if path is None else path

    if isinstance(item, dict):
        for name, child in item.items():
            extract_leaves(child, path + [name], leaves)
        return leaves

    if isinstance(item, list):
        # List positions are intentionally not recorded in the path.
        for element in item:
            extract_leaves(element, path, leaves)
        return leaves

    # Scalar leaf: keep it unless it is an empty string.
    if item != '':
        leaves.append((path, item))
    return leaves
def split_document(document, window_size, overlap, tokenizer):
    """
    Split a document into overlapping chunks of at most ``window_size`` tokens.

    The document is tokenized once. If it fits in a single window it is
    returned unchanged as the only chunk (it is not round-tripped through the
    tokenizer). Otherwise consecutive windows start ``window_size - overlap``
    tokens apart, and each window is converted back to text. Progress
    information is printed to stdout.

    Args:
        document: The text to split.
        window_size: Maximum number of tokens per chunk.
        overlap: Number of tokens shared by two consecutive chunks. Must
            satisfy ``0 <= overlap < window_size`` whenever the document
            needs splitting.
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_string(tokens)``.

    Returns:
        list: The chunk strings, in document order.

    Raises:
        ValueError: If the document must be split and ``overlap`` is negative
            or not smaller than ``window_size``. Such values would give a
            zero or negative stride (no chunks at all) or skip tokens between
            chunks.
    """
    tokens = tokenizer.tokenize(document)
    total = len(tokens)
    print(f"\tLength of document: {total} tokens")

    if total <= window_size:
        chunks = [document]
    else:
        if not 0 <= overlap < window_size:
            raise ValueError(
                f"overlap must satisfy 0 <= overlap < window_size "
                f"(got overlap={overlap}, window_size={window_size})"
            )
        stride = window_size - overlap
        chunks = []
        for start in range(0, total, stride):
            window = tokens[start:start + window_size]
            end = start + len(window)
            print(f"\t{start} to {end}")
            chunks.append(tokenizer.convert_tokens_to_string(window))
            # Stop once a window reaches the last token; later starts would
            # only produce chunks fully contained in this one.
            if end >= total:
                break

    print(f"\tSplit into {len(chunks)} chunks")
    return chunks
def handle_broken_output(pred, prev):
    """
    Return ``pred`` unless it is broken or empty JSON, in which case return ``prev``.

    ``pred`` is considered unusable when it cannot be parsed as JSON, when the
    parsed value is not a JSON object, or when every value of the object is
    an empty string or an empty list (this includes an object with no keys).

    Args:
        pred: The new prediction, expected to be a JSON object as text.
        prev: The previous prediction to fall back to.

    Returns:
        ``pred`` unchanged when it is usable, otherwise ``prev``.
    """
    try:
        parsed = json.loads(pred)
    except (ValueError, TypeError, RecursionError):
        # Malformed JSON (JSONDecodeError is a ValueError), a non-text input,
        # or pathologically deep nesting: keep the previous prediction.
        return prev

    # Valid JSON that is not an object has no fields to inspect.
    if not isinstance(parsed, dict):
        return prev

    # An object whose fields are all empty carries no information.
    if all(value in ("", []) for value in parsed.values()):
        return prev

    return pred
def clean_json_text(text):
    """
    Clean JSON text by trimming whitespace and un-escaping '#' and '&'.

    Leading and trailing whitespace is removed, then every backslash that
    directly precedes a '#' or '&' character is dropped, leaving the bare
    character.

    Args:
        text (str): The raw JSON text.

    Returns:
        str: The cleaned text.
    """
    text = text.strip()
    # The backslashes are written explicitly ("\\#") because a lone backslash
    # before '#' or '&' is an invalid escape sequence in a string literal.
    text = text.replace("\\#", "#").replace("\\&", "&")
    return text
def sync_empty_fields(dict1, dict2):
    """
    Align the empty fields of ``dict1`` with those of ``dict2``, in place.

    For every key of ``dict2``:
      * a dict value is synchronized recursively against ``dict1``'s value
        for that key (an empty dict is used when ``dict1`` lacks the key);
      * an empty value (``None``, ``""``, ``[]``) is copied into ``dict1``
        when ``dict1`` lacks the key;
      * any other value causes ``dict1``'s entry for that key to be removed
        if that entry is itself empty.
    Afterwards every key of ``dict1`` that does not appear in ``dict2`` is
    removed.

    Args:
        dict1 (dict): The dictionary to be modified.
        dict2 (dict): The reference dictionary.

    Returns:
        dict: ``dict1`` itself after modification. If either argument is not
        a dict, ``dict1`` is returned untouched.
    """
    if not (isinstance(dict1, dict) and isinstance(dict2, dict)):
        return dict1

    empty_values = (None, "", [], {})

    for field, reference in dict2.items():
        # Nested dictionaries are handled by recursion.
        if isinstance(reference, dict):
            current = dict1.get(field, {})
            dict1[field] = sync_empty_fields(current, reference)
            continue

        # Reference field is empty: make sure dict1 has the field at all.
        if reference in empty_values:
            if field not in dict1:
                dict1[field] = reference
            continue

        # Reference field is filled: drop dict1's field when it is empty.
        if field in dict1 and dict1[field] in empty_values:
            del dict1[field]

    # Drop every field of dict1 that the reference does not know about.
    for stale in [name for name in dict1 if name not in dict2]:
        del dict1[stale]

    return dict1
|