Steveeeeeeen's picture
Steveeeeeeen HF staff
Upload folder using huggingface_hub
9f0d781 verified
raw
history blame
2.6 kB
from __future__ import annotations
from collections.abc import Sequence
from string import ascii_letters, digits, hexdigits
from urllib.parse import quote as encode_uri_component
# ASCII letters and digits; get_encode_cache() always leaves these unencoded.
ASCII_LETTERS_AND_DIGITS = ascii_letters + digits
# Default `exclude` set of encode(): characters left as-is in addition to
# the alphanumerics above.
ENCODE_DEFAULT_CHARS = ";/?:@&=+$,-_.!~*'()#"
# Narrower exclude set. NOTE(review): not referenced in this part of the
# file; presumably meant for encoding a single URI component -- confirm
# against callers.
ENCODE_COMPONENT_CHARS = "-_.!~*'()"
# Lookup tables built by get_encode_cache(), keyed by the `exclude` string
# they were built for.
encode_cache: dict[str, list[str]] = {}
# Create a lookup array where anything but the characters in the `exclude`
# string and alphanumeric chars is percent-encoded.
def get_encode_cache(exclude: str) -> Sequence[str]:
    """Return the ASCII lookup table used to encode with `exclude`.

    The table has 128 entries, one per ASCII code point. Entry ``i`` is
    the character itself when it is alphanumeric or listed in `exclude`,
    and its ``%XX`` percent-escape (uppercase hex) otherwise. Tables are
    memoized in the module-level ``encode_cache`` dict, keyed by `exclude`.

    Args:
        exclude: Characters to leave unencoded in addition to a-zA-Z0-9.
            Only ASCII characters have an effect: the table covers code
            points 0-127 only, so non-ASCII characters are ignored here
            instead of raising ``IndexError``.

    Returns:
        The 128-entry table; repeated calls with the same `exclude`
        return the same list object.
    """
    cached = encode_cache.get(exclude)
    if cached is not None:
        return cached

    cache: list[str] = [
        chr(code) if chr(code) in ASCII_LETTERS_AND_DIGITS else f"%{code:02X}"
        for code in range(128)
    ]

    for ch in exclude:
        code = ord(ch)
        # Skip anything outside the table; indexing with it would fail.
        if code < 128:
            cache[code] = ch

    # Register the table only once it is complete, so a failure while
    # building can never leave a half-populated entry behind.
    encode_cache[exclude] = cache
    return cache
# Encode unsafe characters with percent-encoding, skipping already
# encoded sequences.
#
# - string - string to encode
# - exclude - list of characters to ignore (in addition to a-zA-Z0-9)
# - keep_escaped - don't encode '%' in a correct escape sequence (default: True)
def encode(
    string: str, exclude: str = ENCODE_DEFAULT_CHARS, *, keep_escaped: bool = True
) -> str:
    """Percent-encode unsafe characters in `string`.

    ASCII characters are translated through the lookup table returned by
    ``get_encode_cache(exclude)``. Non-ASCII characters are percent-encoded
    with ``urllib.parse.quote`` (imported here as ``encode_uri_component``).

    Args:
        string: The text to encode.
        exclude: Characters to leave unencoded in addition to a-zA-Z0-9.
        keep_escaped: When true, a ``%`` followed by two hex digits is
            treated as an existing escape sequence and copied through
            unchanged instead of being encoded again.

    Returns:
        The encoded string.
    """
    cache = get_encode_cache(exclude)
    # Collect pieces and join once instead of repeated string concatenation.
    parts: list[str] = []
    length = len(string)
    i = 0
    while i < length:
        ch = string[i]
        code = ord(ch)

        # "%" followed by two hex digits: already escaped, copy verbatim.
        if keep_escaped and code == 0x25 and i + 2 < length:
            if all(c in hexdigits for c in string[i + 1 : i + 3]):
                parts.append(string[i : i + 3])
                i += 3
                continue

        if code < 128:
            parts.append(cache[code])
            i += 1
            continue

        if 0xD800 <= code <= 0xDFFF:
            # High surrogate followed by a low surrogate: a UTF-16 pair.
            if code <= 0xDBFF and i + 1 < length:
                next_code = ord(string[i + 1])
                if 0xDC00 <= next_code <= 0xDFFF:
                    # Combine the pair into the code point it denotes first;
                    # quote() encodes as strict UTF-8 and would raise
                    # UnicodeEncodeError on the two surrogates themselves.
                    combined = 0x10000 + ((code - 0xD800) << 10) + (next_code - 0xDC00)
                    parts.append(encode_uri_component(chr(combined)))
                    i += 2
                    continue
            # Unpaired surrogate: emit the UTF-8 escape of U+FFFD.
            parts.append("%EF%BF%BD")
            i += 1
            continue

        parts.append(encode_uri_component(ch))
        i += 1

    return "".join(parts)