Spaces:

iscc
/

iscc-playground

Running

App Files Files Community

iscc-playground / app.py

titusz

Synced repo using 'sync_with_huggingface' Github Action

12a4666 verified about 1 year ago

raw

history blame

6.24 kB

	import io
	import base64
	import gradio as gr
	import iscc_core as ic
	import iscc_sdk as idk
	from PIL import Image

	# Process-wide iscc-sdk options, set once at import time.
	# The thumbnail size of 265 matches the 265px height that custom_css
	# gives to ".fixed-height img".
	idk.sdk_opts.image_thumbnail_size = 265
	idk.sdk_opts.image_thumbnail_quality = 80
	# NOTE(review): presumably enables per-chunk ("granular") features in the
	# SDK output - confirm against the iscc-sdk options documentation.
	idk.sdk_opts.granular = True


	# Stylesheet handed to the tabbed interface via `css=custom_css`.
	# - ".fixed-height" is the elem class set on the thumbnail image component.
	# - "#chunked-text" is the elem id set on the highlighted chunk output;
	#   the rule stops its labels from being case-transformed.
	custom_css = """
	.fixed-height img {
	height: 265px; /* Fixed height */
	object-fit: contain; /* Scale the image to fit within the element */
	}
	#chunked-text span.label {
	text-transform: none !important;
	}
	"""

	# Visible stand-ins for line-breaking characters, used by no_nl() so that
	# chunk boundaries stay readable when a chunk contains a line break.
	newline_symbols = {
	    "\u000a": "⏎",  # LF  - line feed
	    "\u000b": "↨",  # VT  - vertical tab
	    "\u000c": "␌",  # FF  - form feed
	    "\u000d": "↵",  # CR  - carriage return
	    "\u0085": "⤓",  # NEL - next line
	    "\u2028": "↲",  # LS  - line separator
	    "\u2029": "¶",  # PS  - paragraph separator
	}


	def no_nl(text):
	    """Return `text` with every key of `newline_symbols` swapped for its symbol."""
	    # One translation pass gives the same result as replacing key by key,
	    # because none of the replacement symbols is itself a key of the table.
	    table = str.maketrans(newline_symbols)
	    return text.translate(table)


	def generate_iscc(file):
	    """Generate an ISCC for an uploaded file.

	    Args:
	        file: Upload object exposing the local path as `.name`, or None
	            when the upload widget has been cleared.

	    Returns:
	        Tuple `(iscc, thumbnail, metadata)`: the ISCC string, a PIL image
	        decoded from the embedded thumbnail (None if there is none), and
	        the metadata dict without a populated thumbnail entry. All three
	        are None when no file is given.
	    """
	    # Clearing the file input fires the change handler with None; reset the
	    # outputs instead of failing on `file.name`.
	    if file is None:
	        return None, None, None

	    imeta = idk.code_iscc(file.name)

	    thumbnail = None
	    if imeta.thumbnail:
	        # The thumbnail string is "<prefix>,<base64 payload>" (presumably a
	        # data URL); only the payload after the first comma is decoded.
	        _, encoded = imeta.thumbnail.split(",", 1)
	        thumbnail = Image.open(io.BytesIO(base64.b64decode(encoded)))

	    metadata = imeta.dict(exclude_unset=False, by_alias=True)
	    # The thumbnail is shown as an image, so keep it out of the JSON view.
	    if metadata.get("thumbnail"):
	        del metadata["thumbnail"]
	    return imeta.iscc, thumbnail, metadata


	def explain_iscc(code):
	    """Render an ISCC in the four encodings shown on the ENCODING tab.

	    Args:
	        code: ISCC string as typed by the user; may be empty.

	    Returns:
	        Tuple `(canonical, human, decomposed, multiformat)` of strings.
	        Four empty strings are returned for empty or whitespace-only input.
	    """
	    # The change handler also fires when the textbox is cleared; there is
	    # nothing to decode then, so blank the outputs.
	    if not code or not code.strip():
	        return "", "", "", ""

	    canonical = ic.iscc_normalize(code)
	    # Put spaces around the dashes of the explained form for readability.
	    human = ic.iscc_explain(code).replace("-", " - ")
	    decomposed = " - ".join(ic.iscc_decompose(canonical))
	    multiformat = ic.Code(canonical).mf_base58btc
	    return canonical, human, decomposed, multiformat


	def generate_text_code(text, chunk_size):
	    """Split text into content-defined chunks labelled with their features.

	    Args:
	        text: Raw input text; may be empty.
	        chunk_size: Value assigned to `idk.sdk_opts.text_avg_chunk_size`
	            for the duration of this call.

	    Returns:
	        List of `(chunk, "size:feature")` tuples in text order, with line
	        breaks inside each chunk replaced by visible symbols. An empty list
	        is returned when there is no text to chunk.
	    """
	    if not text:
	        return []
	    cleaned = ic.text_clean(text)
	    if not cleaned:
	        return []

	    # The chunk size is a global SDK option, so override it temporarily and
	    # always restore it - even if feature extraction raises.
	    # NOTE(review): concurrent requests share this global; confirm whether
	    # the app can serve requests in parallel.
	    previous_chunk_size = idk.sdk_opts.text_avg_chunk_size
	    idk.sdk_opts.text_avg_chunk_size = chunk_size
	    try:
	        processed = idk.text_features(cleaned)
	    finally:
	        idk.sdk_opts.text_avg_chunk_size = previous_chunk_size

	    # Each size is the length of the next slice of the cleaned text.
	    result = []
	    start = 0
	    for size, feat in zip(processed["sizes"], processed["features"]):
	        end = start + size
	        result.append((no_nl(cleaned[start:end]), f"{size}:{feat}"))
	        start = end
	    return result


	# Tab 1: upload a media file and show its ISCC, thumbnail and metadata.
	with gr.Blocks(title="ISCC-CODE") as demo_generate:
	    gr.Markdown("""
	## 🌟 ISCC-CODE Generator - The DNA of digital content
	""")
	    # Upload widget (wide column) next to the thumbnail preview (narrow column).
	    with gr.Row():
	        with gr.Column(scale=2):
	            in_file = gr.File(label="Media File")
	        with gr.Column(scale=1):
	            out_thumbnail = gr.Image(
	                label="Extracted Thumbnail",
	                elem_classes=["fixed-height"],
	            )
	    with gr.Row():
	        out_iscc = gr.Text(label="ISCC-CODE", show_copy_button=True)
	    with gr.Row():
	        out_meta = gr.Json(label="Metadata")
	    # Recompute all three outputs whenever the uploaded file changes.
	    in_file.change(
	        generate_iscc,
	        inputs=[in_file],
	        outputs=[out_iscc, out_thumbnail, out_meta],
	    )


	# Tab 2: type or pick an ISCC and see it in several encodings.
	# The title is passed by keyword, matching the other tabs: the first
	# positional parameter of gr.Blocks is not the title.
	with gr.Blocks(title="ENCODING") as demo_decode:
	    gr.Markdown("""
	## 🌟 A Codec for Self-Describing Compact Binary Codes
	""")
	    with gr.Row():
	        with gr.Column():
	            in_iscc = gr.Text(
	                label="ISCC",
	                info="INPUT ANY VALID ISCC-CODE OR ISCC-UNIT",
	                autofocus=True,
	            )
	            examples = [
	                "ISCC:AAAWN77F727NXSUS",  # Meta-Code
	                "bzqaqaal5rvp72lx2thvq",  # Multiformat
	                "ISCC:EAASKDNZNYGUUF5A",  # Text-Code
	                "ISCC:GABW5LUBVP23N3DOD7PPINHT5JKBI",  # Data-Code 128 bits
	                "ISCC:KUAG5LUBVP23N3DOHCHWIYGXVN7ZS",  # ISCC-SUM
	                "ISCC:KAA2Y5NUST7BFD5NN2XIDK7VW3WG4OEPMRQNPK37TE",  # ISCC-CDI
	                "z36hVxiqoF8AAmDpZV958hn3tsv2i7v1NfCrSzpq",  # ISCC-CDI multiformats
	                "ISCC:KACT4EBWK27737D2AYCJRAL5Z36G76RFRMO4554RU26HZ4ORJGIVHDI",
	            ]
	            gr.Examples(label="Example ISCCs", examples=examples, inputs=[in_iscc])

	    gr.Markdown("## Different Encodings:")
	    with gr.Row():
	        with gr.Column():
	            out_canonical = gr.Text(
	                label="Canonical",
	                info="NORMALIZED STANDARD REPRESENTATION",
	                show_copy_button=True,
	            )
	            out_human = gr.Text(
	                label="Human Readable",
	                info="MAINTYPE - SUBTYPE - VERSION - LENGTH - BODY",
	                show_copy_button=True,
	            )
	            out_decomposed = gr.Text(
	                label="Decomposed",
	                info="ISCC-UNITS",
	                show_copy_button=True,
	            )
	            out_multiformat = gr.Text(
	                label="Multiformat",
	                info="BASE58-BTC",
	                show_copy_button=True,
	            )
	    # Refresh all four encodings whenever the input text changes.
	    in_iscc.change(
	        explain_iscc,
	        inputs=[in_iscc],
	        outputs=[
	            out_canonical,
	            out_human,
	            out_decomposed,
	            out_multiformat,
	        ],
	    )

	# Tab 3: interactive view of how text is split into chunks.
	with gr.Blocks(title="CHUNKING") as demo_text_code:
	    gr.Markdown("""
	## 🌟 Content Defined Chunking for Shift-Resistant Text and Data Segmentation
	""")
	    with gr.Row():
	        with gr.Column():
	            in_text = gr.Textbox(label="Text Input", lines=8, autofocus=True)
	            in_chunksize = gr.Slider(
	                label="Chunk Size",
	                info="AVERAGE NUMBER OF CHARACTERS PER CHUNK",
	                minimum=32,
	                maximum=2048,
	                step=32,
	                value=64,
	            )

	    # NOTE(review): nesting level reconstructed from a flattened source;
	    # confirm whether this output belongs inside the column above.
	    out_text = gr.HighlightedText(
	        label="Chunked Text Output",
	        interactive=False,
	        elem_id="chunked-text",
	    )
	    # Re-chunk whenever either the text or the chunk size changes.
	    chunk_inputs = [in_text, in_chunksize]
	    for component in (in_text, in_chunksize):
	        component.change(generate_text_code, inputs=chunk_inputs, outputs=[out_text])

	# Combine the three Blocks demos into one tabbed app; tab names are given
	# in the same order as the interfaces.
	demo = gr.TabbedInterface(
	    interface_list=[demo_generate, demo_decode, demo_text_code],
	    tab_names=["ISCC-CODE", "ENCODING", "CHUNKING"],
	    title="▶️ ISCC Playground",
	    css=custom_css,
	)

	# Launch the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch(debug=True, show_api=True)