iscc-playground / app.py
titusz's picture
Synced repo using 'sync_with_huggingface' Github Action
909172d verified
raw
history blame
6.21 kB
import io
import base64
import gradio as gr
import iscc_core as ic
import iscc_sdk as idk
from PIL import Image
# One-time ISCC SDK configuration, applied when this module is imported.
_sdk_options = idk.sdk_opts
_sdk_options.image_thumbnail_size = 265  # same 265 as the `.fixed-height` CSS rule
_sdk_options.image_thumbnail_quality = 80
_sdk_options.granular = True  # presumably turns on granular (per-chunk) features — confirm in SDK docs
# Extra CSS for the app: a fixed-height rule for images inside `.fixed-height`
# elements and a rule that disables text-transform on `#chunked-text` labels.
custom_css = (
    "\n"
    ".fixed-height img {\n"
    "height: 265px; /* Fixed height */\n"
    "object-fit: contain; /* Scale the image to fit within the element */\n"
    "}\n"
    "#chunked-text span.label {\n"
    "text-transform: none !important;\n"
    "}\n"
)
# Maps every Unicode line-breaking character to a single visible glyph so that
# line breaks stay recognisable when text is rendered on one line.
# Values are written as escapes so they cannot be damaged by a re-encoding of
# this file (three of them had previously been corrupted into wrong glyphs).
newline_symbols = {
    "\u000a": "\u23ce",  # Line Feed - RETURN SYMBOL
    "\u000b": "\u21a8",  # Vertical Tab - UP DOWN ARROW WITH BASE
    "\u000c": "\u240c",  # Form Feed - SYMBOL FOR FORM FEED (Control Pictures)
    "\u000d": "\u21b5",  # Carriage Return - DOWNWARDS ARROW WITH CORNER LEFTWARDS
    "\u0085": "\u21df",  # Next Line - DOWNWARDS ARROW WITH DOUBLE STROKE
    "\u2028": "\u21b2",  # Line Separator - DOWNWARDS ARROW WITH TIP LEFTWARDS
    "\u2029": "\u00b6",  # Paragraph Separator - PILCROW SIGN
}
def no_nl(text):
    """Return *text* with every key of ``newline_symbols`` replaced by its symbol.

    Replacements are applied one after another, in the mapping's iteration
    order.
    """
    visible = text
    for newline_char in newline_symbols:
        visible = visible.replace(newline_char, newline_symbols[newline_char])
    return visible
def generate_iscc(file):
    """Create the ISCC-CODE, thumbnail image and metadata for an uploaded file.

    Args:
        file: Upload object delivered by the file input; only its ``name``
            attribute (the path on disk) is used. May be ``None`` when the
            upload has been cleared.

    Returns:
        A ``(iscc, thumbnail, metadata)`` tuple: the ISCC string, a PIL image
        decoded from the embedded thumbnail (or ``None`` if there is none) and
        the metadata dict without its ``thumbnail`` entry. All three are
        ``None`` when no file is given.
    """
    if file is None:
        # The handler is also invoked when the upload is removed; there is
        # nothing to process then, so just clear all outputs.
        return None, None, None

    imeta = idk.code_iscc(file.name)

    thumbnail = None
    if imeta.thumbnail:
        # The thumbnail string has the form "<header>,<base64 payload>"
        # (presumably a data URL); only the payload is needed.
        _, encoded = imeta.thumbnail.split(",", 1)
        thumbnail = Image.open(io.BytesIO(base64.b64decode(encoded)))

    metadata = imeta.dict(exclude_unset=False, by_alias=True)
    # The thumbnail is returned separately as an image; drop it from the
    # metadata that is returned alongside it.
    if metadata.get("thumbnail"):
        del metadata["thumbnail"]

    return imeta.iscc, thumbnail, metadata
def explain_iscc(code):
    """Return several encodings of an ISCC given in any accepted notation.

    Args:
        code: ISCC-CODE or ISCC-UNIT string as typed by the user.

    Returns:
        A ``(canonical, human, decomposed, multiformat)`` tuple of strings:
        the normalized form, the ``iscc_explain`` output with its ``-``
        separators spaced out, the decomposed units joined by `` - `` and the
        base58btc multiformat representation. Four empty strings are returned
        for empty or whitespace-only input.

    Raises:
        Whatever ``iscc_core`` raises for a non-empty but invalid code.
    """
    if code is None or not code.strip():
        # This runs on every change of the input field, including when it is
        # emptied; blank the outputs instead of failing inside iscc_core.
        return "", "", "", ""

    canonical = ic.iscc_normalize(code)
    human = " - ".join(ic.iscc_explain(code).split("-"))
    code_obj = ic.Code(canonical)
    decomposed = " - ".join(ic.iscc_decompose(canonical))
    multiformat = code_obj.mf_base58btc
    return canonical, human, decomposed, multiformat
def generate_text_code(text, chunk_size):
    """Split *text* into content-defined chunks and label each with its feature.

    Args:
        text: Raw input text.
        chunk_size: Value assigned to the SDK option ``text_avg_chunk_size``
            for the duration of this call.

    Returns:
        A list of ``(chunk, "size:feature")`` tuples, one per chunk, where the
        chunk text has its line breaks replaced via ``no_nl``. An empty list
        is returned for empty input.
    """
    if not text:
        return []

    # The chunk size is a process-wide SDK option, so it is swapped in only
    # for this call and always restored, even if processing raises.
    # NOTE(review): concurrent calls would still see each other's value.
    original_chunk_size = idk.sdk_opts.text_avg_chunk_size
    idk.sdk_opts.text_avg_chunk_size = chunk_size
    try:
        cleaned = ic.text_clean(text)
        processed = idk.text_features(cleaned)
    finally:
        idk.sdk_opts.text_avg_chunk_size = original_chunk_size

    # "sizes" holds consecutive chunk lengths into the cleaned text; walk them
    # to recover each chunk's slice and pair it with its feature.
    result = []
    start = 0
    for size, feat in zip(processed["sizes"], processed["features"]):
        end = start + size
        result.append((no_nl(cleaned[start:end]), f"{size}:{feat}"))
        start = end
    return result
# Tab: upload a media file and show its ISCC-CODE, thumbnail and metadata.
with gr.Blocks(title="ISCC-CODE") as demo_generate:
    gr.Markdown("\n## 🌟 ISCC-CODE Generator - The DNA of digital content\n")

    # Upload widget and thumbnail preview side by side (relative scale 2 : 1).
    with gr.Row():
        with gr.Column(scale=2):
            in_file = gr.File(label="Media File")
        with gr.Column(scale=1):
            out_thumbnail = gr.Image(
                label="Extracted Thumbnail",
                elem_classes=["fixed-height"],
            )

    with gr.Row():
        out_iscc = gr.Text(
            label="ISCC-CODE",
            show_copy_button=True,
        )

    with gr.Row():
        out_meta = gr.Json(label="Metadata")

    # Recompute all three outputs whenever the file input changes.
    in_file.change(
        fn=generate_iscc,
        inputs=[in_file],
        outputs=[out_iscc, out_thumbnail, out_meta],
    )
# Tab: type or pick an ISCC and see it in several encodings.
# The tab name must be passed as `title=`; the first positional parameter of
# gr.Blocks is the theme, so a bare "ENCODING" would be treated as a theme name.
with gr.Blocks(title="ENCODING") as demo_decode:
    gr.Markdown("""
## 🌟 A Codec for Self-Describing Compact Binary Codes
""")
    with gr.Row():
        with gr.Column():
            in_iscc = gr.Text(
                label="ISCC",
                info="INPUT ANY VALID ISCC-CODE OR ISCC-UNIT",
                autofocus=True,
            )
            # Clickable sample inputs covering the supported notations.
            examples = [
                "ISCC:AAAWN77F727NXSUS",  # Meta-Code
                "bzqaqaal5rvp72lx2thvq",  # Multiformat
                "ISCC:EAASKDNZNYGUUF5A",  # Text-Code
                "ISCC:GABW5LUBVP23N3DOD7PPINHT5JKBI",  # Data-Code 128 bits
                "ISCC:KUAG5LUBVP23N3DOHCHWIYGXVN7ZS",  # ISCC-SUM
                "ISCC:KAA2Y5NUST7BFD5NN2XIDK7VW3WG4OEPMRQNPK37TE",  # ISCC-CDI
                "z36hVxiqoF8AAmDpZV958hn3tsv2i7v1NfCrSzpq",  # ISCC-CDI multiformats
                "ISCC:KACT4EBWK27737D2AYCJRAL5Z36G76RFRMO4554RU26HZ4ORJGIVHDI",
            ]
            gr.Examples(label="Example ISCCs", examples=examples, inputs=[in_iscc])

    gr.Markdown("## Different Encodings:")
    with gr.Row():
        with gr.Column():
            out_canonical = gr.Text(
                label="Canonical",
                info="NORMALIZED STANDARD REPRESENTATION",
                show_copy_button=True,
            )
            out_human = gr.Text(
                label="Human Readable",
                info="MAINTYPE - SUBTYPE - VERSION - LENGTH - BODY",
                show_copy_button=True,
            )
            out_decomposed = gr.Text(
                label="Decomposed",
                info="ISCC-UNITS",
                show_copy_button=True,
            )
            out_multiformat = gr.Text(
                label="Multiformat",
                info="BASE58-BTC",
                show_copy_button=True,
            )

    # Refresh every encoding whenever the input text changes.
    in_iscc.change(
        explain_iscc,
        inputs=[in_iscc],
        outputs=[
            out_canonical,
            out_human,
            out_decomposed,
            out_multiformat,
        ],
    )
# Tab: show how input text is split into content-defined chunks.
with gr.Blocks(title="CHUNKING") as demo_text_code:
    gr.Markdown(
        "\n## 🌟 Content Defined Chunking for Shift-Resistant Text and Data Segmentation\n"
    )

    with gr.Row():
        with gr.Column():
            in_text = gr.Textbox(
                label="Text Input",
                lines=8,
                autofocus=True,
            )
            in_chunksize = gr.Slider(
                label="Chunk Size",
                info="AVERAGE NUMBER OF CHARACTERS PER CHUNK",
                minimum=32,
                maximum=2048,
                step=32,
                value=64,
            )
            out_text = gr.HighlightedText(
                label="Chunked Text Output",
                interactive=False,
                elem_id="chunked-text",
            )

    # Both the text and the slider feed the same handler, so a change to
    # either one re-chunks the current text.
    in_text.change(
        fn=generate_text_code,
        inputs=[in_text, in_chunksize],
        outputs=[out_text],
    )
    in_chunksize.change(
        fn=generate_text_code,
        inputs=[in_text, in_chunksize],
        outputs=[out_text],
    )
# Combine the three tabs into the final app.
demo = gr.TabbedInterface(
    # U+25B6 U+FE0F is the "play button" emoji; written as escapes because the
    # literal glyph had been corrupted into U+25A2 (a hollow square).
    title="\u25b6\ufe0f ISCC Playground",
    interface_list=[demo_generate, demo_decode, demo_text_code],
    tab_names=["ISCC-CODE", "ENCODING", "CHUNKING"],
    css=custom_css,
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()