titusz committed on
Commit
f14de11
1 Parent(s): fb8bfcc

Synced repo using 'sync_with_huggingface' Github Action

Browse files
app.py CHANGED
@@ -1,212 +1,48 @@
1
- import io
2
- import base64
3
  import gradio as gr
4
- import iscc_core as ic
5
- import iscc_sdk as idk
6
- from PIL import Image
7
-
8
- idk.sdk_opts.image_thumbnail_size = 265
9
- idk.sdk_opts.image_thumbnail_quality = 80
10
- idk.sdk_opts.granular = True
11
-
12
 
13
  custom_css = """
14
- .fixed-height img {
15
- height: 265px; /* Fixed height */
16
  object-fit: contain; /* Scale the image to fit within the element */
17
  }
18
  #chunked-text span.label {
19
  text-transform: none !important;
20
  }
21
- """
22
-
23
- newline_symbols = {
24
- "\u000a": "⏎", # Line Feed - Represented by the 'Return' symbol
25
- "\u000b": "↨", # Vertical Tab - Represented by the 'Up Down Arrow' symbol
26
- "\u000c": "␌", # Form Feed - Unicode Control Pictures representation
27
- "\u000d": "↵", # Carriage Return - 'Downwards Arrow with Corner Leftwards' symbol
28
- "\u0085": "⤓", # Next Line - 'Downwards Arrow with Double Stroke' symbol
29
- "\u2028": "↲", # Line Separator - 'Downwards Arrow with Tip Leftwards' symbol
30
- "\u2029": "¶", # Paragraph Separator - Represented by the 'Pilcrow' symbol
31
  }
32
 
 
 
 
 
33
 
34
- def no_nl(text):
35
- for char, symbol in newline_symbols.items():
36
- text = text.replace(char, symbol)
37
- return text
38
-
39
-
40
- def generate_iscc(file):
41
- imeta = idk.code_iscc(file.name)
42
- thumbnail = None
43
- if imeta.thumbnail:
44
- header, encoded = imeta.thumbnail.split(",", 1)
45
- data = base64.b64decode(encoded)
46
- thumbnail = Image.open(io.BytesIO(data))
47
- metadata = imeta.dict(exclude_unset=False, by_alias=True)
48
- if metadata.get("thumbnail"):
49
- del metadata["thumbnail"]
50
- return imeta.iscc, thumbnail, metadata
51
-
52
-
53
- def explain_iscc(code):
54
- canonical = ic.iscc_normalize(code)
55
- human = " - ".join(ic.iscc_explain(code).split("-"))
56
- code_obj = ic.Code(canonical)
57
- decomposed = " - ".join(ic.iscc_decompose(canonical))
58
- multiformat = code_obj.mf_base58btc
59
- return canonical, human, decomposed, multiformat
60
-
61
-
62
- def generate_text_code(text, chunk_size):
63
- original_chunk_size = idk.sdk_opts.text_avg_chunk_size
64
- idk.sdk_opts.text_avg_chunk_size = chunk_size
65
- cleaned = ic.text_clean(text)
66
- processed = idk.text_features(cleaned)
67
- features = processed["features"]
68
- sizes = processed["sizes"]
69
- start = 0
70
- chunks = []
71
- for size in sizes:
72
- end = start + size
73
- chunks.append(no_nl(cleaned[start:end]))
74
- start = end
75
- result = [
76
- (chunk, f"{size}:{feat}") for chunk, size, feat in zip(chunks, sizes, features)
77
- ]
78
- idk.sdk_opts.text_avg_chunk_size = original_chunk_size
79
- return result
80
-
81
-
82
- ####################################################################################################
83
- # TAB ISCC-CODE #
84
- ####################################################################################################
85
-
86
- with gr.Blocks() as demo_generate:
87
- gr.Markdown(
88
- """
89
- ## 🌟 ISCC-CODE Generator - The DNA of digital content
90
- """
91
- )
92
- with gr.Row():
93
- with gr.Column(scale=2):
94
- in_file = gr.File(label="Media File")
95
- with gr.Column(scale=1):
96
- out_thumbnail = gr.Image(
97
- label="Extracted Thumbnail", elem_classes=["fixed-height"]
98
- )
99
- with gr.Row():
100
- out_iscc = gr.Text(label="ISCC-CODE", show_copy_button=True)
101
- with gr.Row():
102
- out_meta = gr.Json(label="Metadata")
103
- in_file.change(
104
- generate_iscc, inputs=[in_file], outputs=[out_iscc, out_thumbnail, out_meta]
105
- )
106
-
107
- ####################################################################################################
108
- # TAB ENCODING #
109
- ####################################################################################################
110
-
111
- with gr.Blocks() as demo_decode:
112
- gr.Markdown(
113
- """
114
- ## 🌟 A Codec for Self-Describing Compact Binary Codes
115
- """
116
- )
117
- with gr.Row():
118
- with gr.Column():
119
- in_iscc = gr.Text(
120
- label="ISCC",
121
- info="INPUT ANY VALID ISCC-CODE OR ISCC-UNIT",
122
- autofocus=True,
123
- )
124
- examples = [
125
- "ISCC:AAAWN77F727NXSUS", # Meta-Code
126
- "bzqaqaal5rvp72lx2thvq", # Multiformat
127
- "ISCC:EAASKDNZNYGUUF5A", # Text-Code
128
- "ISCC:GABW5LUBVP23N3DOD7PPINHT5JKBI", # Data-Code 128 bits
129
- "ISCC:KUAG5LUBVP23N3DOHCHWIYGXVN7ZS", # ISCC-SUM
130
- "ISCC:KAA2Y5NUST7BFD5NN2XIDK7VW3WG4OEPMRQNPK37TE", # ISCC-CDI
131
- "z36hVxiqoF8AAmDpZV958hn3tsv2i7v1NfCrSzpq", # ISCC-CDI multiformats
132
- "ISCC:KACT4EBWK27737D2AYCJRAL5Z36G76RFRMO4554RU26HZ4ORJGIVHDI",
133
- ]
134
- gr.Examples(label="Example ISCCs", examples=examples, inputs=[in_iscc])
135
-
136
- gr.Markdown("## Different Encodings:")
137
- with gr.Row():
138
- with gr.Column():
139
- out_canonical = gr.Text(
140
- label="Canonical",
141
- info="NORMALIZED STANDARD REPRESENTATION",
142
- show_copy_button=True,
143
- )
144
- out_human = gr.Text(
145
- label="Human Readable",
146
- info="MAINTYPE - SUBTYPE - VERSION - LENGTH - BODY",
147
- show_copy_button=True,
148
- )
149
- out_decomposed = gr.Text(
150
- label="Decomposed",
151
- info="ISCC-UNITS",
152
- show_copy_button=True,
153
- )
154
- out_multiformat = gr.Text(
155
- label="Multiformat",
156
- info="BASE58-BTC",
157
- show_copy_button=True,
158
- )
159
- in_iscc.change(
160
- explain_iscc,
161
- inputs=[in_iscc],
162
- outputs=[
163
- out_canonical,
164
- out_human,
165
- out_decomposed,
166
- out_multiformat,
167
- ],
168
- )
169
 
170
- ####################################################################################################
171
- # CHUNKING #
172
- ####################################################################################################
173
 
174
- with gr.Blocks() as demo_chunking:
175
- gr.Markdown(
176
- """
177
- ## 🌟 Content Defined Chunking for Shift-Resistant Text and Data Segmentation
178
- """
179
- )
180
- with gr.Row():
181
- with gr.Column():
182
- in_text = gr.Textbox(label="Text Input", lines=8, autofocus=True)
183
- in_chunksize = gr.Slider(
184
- label="Chunk Size",
185
- info="AVERAGE NUMBER OF CHARACTERS PER CHUNK",
186
- minimum=32,
187
- maximum=2048,
188
- step=32,
189
- value=64,
190
- )
191
 
192
- out_text = gr.HighlightedText(
193
- label="Chunked Text Output",
194
- interactive=False,
195
- elem_id="chunked-text",
196
- )
197
- in_text.change(
198
- generate_text_code, inputs=[in_text, in_chunksize], outputs=[out_text]
199
- )
200
- in_chunksize.change(
201
- generate_text_code, inputs=[in_text, in_chunksize], outputs=[out_text]
202
- )
203
 
204
  demo = gr.TabbedInterface(
205
- title="▶️ ISCC Playground",
206
- interface_list=[demo_generate, demo_decode, demo_chunking],
207
- tab_names=["ISCC-CODE", "ENCODING", "CHUNKING"],
208
  css=custom_css,
 
209
  )
210
 
 
211
  if __name__ == "__main__":
212
  demo.launch()
 
 
 
1
  import gradio as gr
2
+ from demos.generate import demo as demo_generate
3
+ from demos.compare import demo as demo_compare
4
+ from demos.inspect_ import demo as demo_inspect
5
+ from demos.chunker import demo as demo_chunker
 
 
 
 
6
 
7
  custom_css = """
8
+ .fixed-height {
9
+ height: 240px; /* Fixed height */
10
  object-fit: contain; /* Scale the image to fit within the element */
11
  }
12
  #chunked-text span.label {
13
  text-transform: none !important;
14
  }
15
+ .json-holder {
16
+ word-wrap: break-word;
17
+ white-space: pre-wrap;
 
 
 
 
 
 
 
18
  }
19
 
20
+ #examples-a, #examples-b {
21
+ height: 140px; /* Fixed height */
22
+ object-fit: contain; /* Scale the image to fit within the element */
23
+ }
24
 
25
+ textarea {
26
+ font-family: JetBrains Mono;
27
+ }
28
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
 
 
 
30
 
31
+ iscc_theme = gr.themes.Default(
32
+ font=gr.themes.GoogleFont("Readex Pro"),
33
+ font_mono=gr.themes.GoogleFont("JetBrains Mono"),
34
+ radius_size=gr.themes.sizes.radius_none,
35
+ )
 
 
 
 
 
 
 
 
 
 
 
 
36
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  demo = gr.TabbedInterface(
39
+ title="▶️ ISCC Playground - The DNA of your digital content",
40
+ interface_list=[demo_generate, demo_compare, demo_inspect, demo_chunker],
41
+ tab_names=["GENERATE", "COMPARE", "INSPECT", "CHUNKER"],
42
  css=custom_css,
43
+ theme=iscc_theme,
44
  )
45
 
46
+
47
  if __name__ == "__main__":
48
  demo.launch()
demos/__init__.py ADDED
File without changes
demos/chunker.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import iscc_core as ic
3
+ import iscc_sdk as idk
4
+ import pathlib
5
+
6
+
7
# Resolve the bundled sample relative to this module so the demo does not
# depend on the current working directory.
HERE = pathlib.Path(__file__).parent.absolute()
SAMPLE_FILEPATH = HERE / "samples/sample.txt"
# read_text() opens, reads and closes the file itself, so no file handle is
# left open for the lifetime of the process.
sample_text = SAMPLE_FILEPATH.read_text(encoding="utf-8")
10
+
11
newline_symbols = {
    "\u000a": "⏎",  # Line Feed - Represented by the 'Return' symbol
    "\u000b": "↨",  # Vertical Tab - Represented by the 'Up Down Arrow' symbol
    "\u000c": "␌",  # Form Feed - Unicode Control Pictures representation
    "\u000d": "↵",  # Carriage Return - 'Downwards Arrow with Corner Leftwards' symbol
    "\u0085": "⤓",  # Next Line - 'Downwards Arrow with Double Stroke' symbol
    "\u2028": "↲",  # Line Separator - 'Downwards Arrow with Tip Leftwards' symbol
    "\u2029": "¶",  # Paragraph Separator - Represented by the 'Pilcrow' symbol
}

custom_css = """
#chunked-text span.label {
    text-transform: none !important;
}
"""

# Translation table derived once from `newline_symbols`; lets `no_nl` swap all
# newline-like characters in a single pass instead of one pass per character.
_NEWLINE_TABLE = str.maketrans(newline_symbols)


def no_nl(text):
    """Replace non-printable newline characters with printable symbols

    Every character that is a key of `newline_symbols` is substituted by its
    mapped symbol; all other characters are returned unchanged.
    """
    return text.translate(_NEWLINE_TABLE)
33
+
34
+
35
def chunk_text(text, chunk_size):
    """Split `text` into content-defined chunks labelled with size and feature.

    The text is cleaned with `ic.text_clean`, then `idk.text_features` is run
    with `idk.sdk_opts.text_avg_chunk_size` temporarily set to `chunk_size`.

    Returns a list of `(chunk, "size:feature")` tuples, where `chunk` is the
    matching slice of the cleaned text with newline characters made visible
    via `no_nl`.
    """
    original_chunk_size = idk.sdk_opts.text_avg_chunk_size
    idk.sdk_opts.text_avg_chunk_size = chunk_size
    try:
        cleaned = ic.text_clean(text)
        processed = idk.text_features(cleaned)
    finally:
        # The option is process-wide: always put the previous value back, even
        # when processing raises, so a failed request cannot leak its chunk
        # size into later calls.
        idk.sdk_opts.text_avg_chunk_size = original_chunk_size

    result = []
    start = 0
    # Sizes are consecutive slice lengths over the cleaned text.
    for size, feat in zip(processed["sizes"], processed["features"]):
        end = start + size
        result.append((no_nl(cleaned[start:end]), f"{size}:{feat}"))
        start = end
    return result
53
+
54
+
55
+ with gr.Blocks(css=custom_css) as demo:
56
+ with gr.Row(variant="panel"):
57
+ gr.Markdown(
58
+ """
59
+ ## ✂️ ISCC Chunker
60
+ Demo of Content-Defined Variable-Length Chunking for Shift-Resistant Text and Data Segmentation
61
+ """,
62
+ )
63
+ with gr.Row(variant="panel"):
64
+ with gr.Column(variant="panel"):
65
+ in_text = gr.TextArea(
66
+ label="Text Chunker",
67
+ placeholder="Paste your text here",
68
+ lines=12,
69
+ max_lines=12,
70
+ )
71
+ in_chunksize = gr.Slider(
72
+ label="Chunk Size",
73
+ info="AVERAGE NUMBER OF CHARACTERS PER CHUNK",
74
+ minimum=64,
75
+ maximum=2048,
76
+ step=32,
77
+ value=64,
78
+ )
79
+ gr.Examples(label="Sample Text", examples=[sample_text], inputs=[in_text])
80
+
81
+ out_text = gr.HighlightedText(
82
+ label="Chunked Text Output",
83
+ interactive=False,
84
+ elem_id="chunked-text",
85
+ )
86
+ with gr.Row():
87
+ gr.ClearButton(components=[in_text, in_chunksize, out_text])
88
+ with gr.Row(variant="panel"):
89
+ gr.Markdown(
90
+ """
91
+ ## 📖 Help & Instructions
92
+
93
+ This Demo showcases ISCC's shift-resistant chunking algorithm. Here's how to use it:
94
+
95
+ A) **Paste your text** into the "Text Chunker" field or select the sample below.
96
+
97
+ The **"Chunked Text Output"** will display the results, highlighting each chunk and its
98
+ number of characters and associated similarity hash.
99
+
100
+ B) **Edit the text** in the "Text Chunker" field
101
+
102
+ Observe how most chunks stay the same (same length and same hash) even if you make edits
103
+ in the beginning of the text.
104
+
105
+ C) **Adjust the "Chunk Size"** slider to control the average number of characters per chunk.
106
+
107
+ Observe how the chunks get smaller/larger on average. Smaller sizes result in more,
108
+ more fine grained chunks, while larger sizes produce fewer, larger chunks on average.
109
+
110
+ D) Use the **Clear Button** to start over.
111
+
112
+ For more information about ISCC chunking, please visit: https://core.iscc.codes/algorithms/cdc/
113
+ """,
114
+ )
115
+
116
+ gr.Markdown(
117
+ """
118
+ ## What is Content-Defined Chunking?
119
+
120
+ This method segments text (or data) into chunks using a content-defined approach, which is
121
+ resilient to shifts in the text. It ensures that changes in the beginning of the text have
122
+ minimal impact on the chunk boundaries further in the text, making it ideal for version
123
+ control, data deduplication, and similar applications where detecting content changes
124
+ efficiently is crucial.
125
+
126
+ ## How does ISCC use Content-Defined Chunking?
127
+
128
+ The [Data-Code](https://github.com/iscc/iscc-core/blob/main/iscc_core/code_data.py) is
129
+ generated by chunking the raw file bitstream with an average chunk size of 1024 bytes.
130
+ The chunks are hashed with `xxhash` and processed with a `minhash` algorithm.
131
+
132
+ It is also used by the [iscc-sdk](https://github.com/iscc/iscc-sdk) to generate granular
133
+ syntactic similarity hashes for textual content with an average chunk size of 1024
134
+ characters. When activated the granular chunk hashes are attached to the generated ISCC
135
+ Metadata.
136
+ """
137
+ )
138
+
139
+ in_text.change(chunk_text, inputs=[in_text, in_chunksize], outputs=[out_text])
140
+ in_chunksize.change(chunk_text, inputs=[in_text, in_chunksize], outputs=[out_text])
141
+
142
+
143
+ if __name__ == "__main__":
144
+ demo.launch()
demos/compare.py ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import io
3
+ from loguru import logger as log
4
+ from pathlib import Path
5
+ import gradio as gr
6
+ from PIL import Image
7
+ import iscc_core as ic
8
+ import iscc_sdk as idk
9
+ import iscc_sci as sci
10
+ import plotly.graph_objects as go
11
+ import pandas as pd
12
+
13
+
14
+ idk.sdk_opts.image_thumbnail_size = 265
15
+ idk.sdk_opts.image_thumbnail_quality = 80
16
+
17
+
18
+ HERE = Path(__file__).parent.absolute()
19
+ IMAGES1 = HERE / "images1"
20
+ IMAGES2 = HERE / "images2"
21
+
22
+
23
+ custom_css = """
24
+ .fixed-height {
25
+ height: 240px; /* Fixed height */
26
+ object-fit: contain; /* Scale the image to fit within the element */
27
+ }
28
+
29
+ #examples-a, #examples-b {
30
+ height: 140px; /* Fixed height */
31
+ object-fit: contain; /* Scale the image to fit within the element */
32
+ }
33
+ """
34
+
35
+
36
def iscc_semantic(filepath: str) -> idk.IsccMeta:
    """Generate ISCC-CODE extended with Semantic-Code for supported modalities (Image)

    Files whose detected mode is not "image" get the plain result of
    `idk.code_iscc`. For images a 64-bit semantic unit is appended to the
    decomposed units and the ISCC is regenerated from the extended list.
    """
    imeta = idk.code_iscc(filepath)
    if imeta.mode != "image":
        return imeta

    # Inject Semantic-Code
    semantic_unit = sci.code_image_semantic(filepath, bits=64)["iscc"]
    units = [*ic.iscc_decompose(imeta.iscc), semantic_unit]
    imeta.iscc = ic.gen_iscc_code(units)["iscc"]
    return imeta
47
+
48
+
49
def dist_to_sim(data, dim=64):
    """Convert a mapping of comparison results into similarity scores.

    Each key is shortened to the part before its first underscore and
    title-cased. The "instance_match" entry becomes 1.0 when its value is
    `True` and -1.0 otherwise; every other value is passed to
    `hamming_to_cosine` together with `dim`.
    """
    similarities = {}
    for key, value in data.items():
        label = key.split("_")[0].title()
        if key != "instance_match":
            similarities[label] = hamming_to_cosine(value, dim)
            continue
        similarities[label] = 1.0 if value is True else -1.0
    return similarities
57
+
58
+
59
def hamming_to_cosine(hamming_distance: int, dim: int) -> float:
    """Approximate the cosine similarity for a given hamming distance and dimension"""
    # Distance 0 gives 1.0, distance dim/2 gives 0.0, distance dim gives -1.0.
    similarity = 1 - 2 * hamming_distance / dim
    log.debug(
        f"Hamming distance: {hamming_distance} - Dim: {dim} - Result: {similarity}"
    )
    return similarity
64
+
65
+
66
def similarity_plot(sim_data):
    # type: (dict) -> go.Figure
    """Build a horizontal bar chart with one bar per entry of `sim_data`.

    Keys become the bar categories and values the signed bar lengths. Each bar
    is labelled with its value as a percentage (two decimals).
    """

    def _bar_color(value):
        # Negative values use the red tone, all others the green tone; the
        # alpha channel is the magnitude of the value.
        if value < 0:
            return f"rgba(224,122,95,{abs(value)})"
        return f"rgba(118,185,71,{value})"

    def _as_percent(value):
        return f"{value:.2f}%"

    # Rows follow the reversed insertion order of the input mapping; no
    # sorting by value is applied.
    frame = pd.DataFrame(reversed(sim_data.items()), columns=["Category", "Value"])
    frame["Percentage"] = frame["Value"] * 100
    frame["Color"] = [_bar_color(value) for value in frame["Value"]]

    bars = go.Bar(
        x=frame["Value"],
        y=frame["Category"],
        orientation="h",
        marker_color=frame["Color"],
        text=frame["Percentage"].apply(_as_percent),
        textposition="inside",
    )
    fig = go.Figure()
    fig.add_trace(bars)

    # Layout: centered title, percent ticks, transparent background and a
    # height that grows with the number of entries.
    fig.update_layout(
        title={"text": "Approximate ISCC-UNIT Similarities", "x": 0.5},
        xaxis={"title": "Similarity", "tickformat": ",.0%"},
        yaxis={"title": ""},
        plot_bgcolor="rgba(0,0,0,0)",
        height=len(sim_data) * 70,
        showlegend=False,
        autosize=True,
        margin={"l": 50, "r": 50, "t": 50, "b": 50},
    )

    # Fixed x-range slightly wider than [-1, 1] to leave room for the labels.
    fig.update_xaxes(range=[-1.1, 1.1])
    return fig
108
+
109
+
110
+ with gr.Blocks(css=custom_css) as demo:
111
+ gr.Markdown("## 🖼️ ISCC Similarity Comparison")
112
+
113
+ with gr.Row(variant="default", equal_height=True):
114
+ with gr.Column(variant="compact"):
115
+ in_file_a = gr.File(
116
+ label="Media File A", type="filepath", elem_classes=["fixed-height"]
117
+ )
118
+ out_thumb_a = gr.Image(
119
+ label="Extracted Thumbnail",
120
+ visible=False,
121
+ height=240,
122
+ elem_classes=["fixed-height"],
123
+ interactive=True,
124
+ show_download_button=False,
125
+ sources=["upload"],
126
+ )
127
+
128
+ # Proxy component to patch image example selection -> gr.File
129
+ dumy_image_a = gr.Image(visible=False, type="filepath", height=240)
130
+
131
+ gr.Examples(
132
+ examples=IMAGES1.as_posix(),
133
+ cache_examples=False,
134
+ inputs=[dumy_image_a],
135
+ elem_id="examples-a",
136
+ )
137
+
138
+ out_iscc_a = gr.Text(label="ISCC")
139
+ with gr.Accordion(label="ISCC Metadata", open=False):
140
+ out_meta_a = gr.Code(language="json", label="JSON-LD")
141
+
142
+ with gr.Column(variant="compact"):
143
+ in_file_b = gr.File(
144
+ label="Media File B", type="filepath", elem_classes=["fixed-height"]
145
+ )
146
+
147
+ out_thumb_b = gr.Image(
148
+ label="Extracted Thumbnail",
149
+ visible=False,
150
+ height=240,
151
+ elem_classes=["fixed-height"],
152
+ interactive=True,
153
+ show_download_button=False,
154
+ sources=["upload"],
155
+ )
156
+
157
+ # Proxy component to patch image example selection -> gr.File
158
+ dumy_image_b = gr.Image(visible=False, type="filepath", height=240)
159
+
160
+ gr.Examples(
161
+ examples=IMAGES2.as_posix(),
162
+ cache_examples=False,
163
+ inputs=[dumy_image_b],
164
+ elem_id="examples-b",
165
+ )
166
+
167
+ out_iscc_b = gr.Text(label="ISCC")
168
+ with gr.Accordion(label="ISCC Metadata", open=False):
169
+ out_meta_b = gr.Code(language="json", label="JSON-LD")
170
+
171
+ with gr.Row(variant="panel"):
172
+ out_compare = gr.Plot(
173
+ label="Approximate ISCC-UNIT Similarities", container=False
174
+ )
175
+
176
def rewrite_uri(filepath, sample_set):
    # type: (str, str) -> str | None
    """Rewrites temporary image URI to original sample URI

    Maps the file name of `filepath` onto the directory `HERE / sample_set`
    and returns the result as a POSIX path string. A `.jpeg` extension is
    rewritten to `.jpg`. Returns `None` when `filepath` is empty.
    """
    if not filepath:
        # Nothing selected: return explicitly instead of falling through with
        # unbound locals.
        return None

    log.info(filepath)
    inpath = Path(filepath)
    name = inpath.name
    # Only touch the extension; "jpeg" inside the stem must stay intact.
    if inpath.suffix == ".jpeg":
        name = inpath.with_suffix(".jpg").name
    return (HERE / sample_set / name).as_posix()
185
+
186
def process_upload(filepath, suffix):
    # type: (str, str) -> dict
    """Generate extended ISCC with experimental Semantic Code (for images)

    `suffix` selects the component group ("a" or "b" in the event wiring) whose
    components are updated. The result maps components to their new values:
    the file input is hidden and cleared, the thumbnail preview is shown, and
    the ISCC and JSON metadata outputs are filled.
    """
    # Map to active component group via the module-level variable names
    component = {
        role: globals().get(f"{role}_{suffix}")
        for role in ("in_file", "out_thumb", "out_iscc", "out_meta")
    }

    # Handle empty filepath: only the file input is reset
    if not filepath:
        return {component["in_file"]: None}

    imeta = iscc_semantic(filepath)

    # Pop Thumbnail for Preview: decode the part after the first comma as
    # base64 image data and drop it from the metadata.
    preview = None
    if imeta.thumbnail:
        _, encoded = imeta.thumbnail.split(",", 1)
        preview = Image.open(io.BytesIO(base64.b64decode(encoded)))
        imeta.thumbnail = None

    return {
        component["in_file"]: gr.File(visible=False, value=None),
        component["out_thumb"]: gr.Image(visible=True, value=preview),
        component["out_iscc"]: imeta.iscc,
        component["out_meta"]: imeta.json(
            exclude_unset=False, by_alias=True, indent=2
        ),
    }
220
+
221
def iscc_compare(iscc_a, iscc_b):
    # type: (str, str) -> go.Figure | None
    """Compare two ISCCs

    Returns `None` unless both inputs are non-empty; otherwise the result of
    `ic.iscc_compare` is converted to similarities (dim=64) and plotted.
    """
    if not (iscc_a and iscc_b):
        return None
    distances = ic.iscc_compare(iscc_a, iscc_b)
    return similarity_plot(dist_to_sim(distances, dim=64))
230
+
231
+ # Events
232
+ in_file_a.change(
233
+ lambda file: process_upload(file, "a"),
234
+ inputs=[in_file_a],
235
+ outputs=[in_file_a, out_thumb_a, out_iscc_a, out_meta_a],
236
+ show_progress="full",
237
+ )
238
+ in_file_b.change(
239
+ lambda file: process_upload(file, "b"),
240
+ inputs=[in_file_b],
241
+ outputs=[in_file_b, out_thumb_b, out_iscc_b, out_meta_b],
242
+ show_progress="full",
243
+ )
244
+ out_thumb_a.clear(
245
+ lambda: (gr.File(visible=True), gr.Image(visible=False), "", ""),
246
+ inputs=[],
247
+ outputs=[in_file_a, out_thumb_a, out_iscc_a, out_meta_a],
248
+ show_progress="hidden",
249
+ )
250
+
251
+ out_thumb_b.clear(
252
+ lambda: (gr.File(visible=True), gr.Image(visible=False), "", ""),
253
+ inputs=[],
254
+ outputs=[in_file_b, out_thumb_b, out_iscc_b, out_meta_b],
255
+ show_progress="hidden",
256
+ )
257
+
258
+ out_iscc_a.change(
259
+ iscc_compare,
260
+ inputs=[out_iscc_a, out_iscc_b],
261
+ outputs=[out_compare],
262
+ show_progress="hidden",
263
+ )
264
+
265
+ out_iscc_b.change(
266
+ iscc_compare,
267
+ inputs=[out_iscc_a, out_iscc_b],
268
+ outputs=[out_compare],
269
+ show_progress="hidden",
270
+ )
271
+
272
+ dumy_image_a.change(
273
+ lambda file: rewrite_uri(file, "images1"),
274
+ inputs=[dumy_image_a],
275
+ outputs=[in_file_a],
276
+ show_progress="hidden",
277
+ )
278
+ dumy_image_b.change(
279
+ lambda file: rewrite_uri(file, "images2"),
280
+ inputs=[dumy_image_b],
281
+ outputs=[in_file_b],
282
+ show_progress="hidden",
283
+ )
284
+
285
+
286
+ if __name__ == "__main__":
287
+ demo.launch(debug=True)
demos/generate.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import base64
3
+ import io
4
+ import gradio as gr
5
+ import iscc_core as ic
6
+ import iscc_sdk as idk
7
+ import iscc_sci as sci
8
+ import iscc_schema as iss
9
+ from PIL import Image
10
+ import json
11
+
12
+ idk.sdk_opts.image_thumbnail_size = 240
13
+ idk.sdk_opts.image_thumbnail_quality = 80
14
+
15
+ custom_css = """
16
+ .fixed-height img {
17
+ height: 240px; /* Fixed height */
18
+ object-fit: contain; /* Scale the image to fit within the element */
19
+ }
20
+ """
21
+
22
+
23
def generate_iscc(file):
    """Generate the ISCC and display values for an uploaded file.

    Returns a 6-tuple matching the outputs of the upload event: ISCC string,
    decoded thumbnail image (or `None`), name, description, metadata as
    indented JSON without a non-empty thumbnail entry, and `None` for the
    file input itself.
    """
    imeta = idk.code_iscc(file.name)

    # Decode the part of the thumbnail string after the first comma as
    # base64 image data.
    preview = None
    if imeta.thumbnail:
        _, payload = imeta.thumbnail.split(",", 1)
        preview = Image.open(io.BytesIO(base64.b64decode(payload)))

    meta = imeta.dict(exclude_unset=False, by_alias=True)
    if meta.get("thumbnail"):
        meta.pop("thumbnail")

    return (
        imeta.iscc,
        preview,
        imeta.name,
        imeta.description,
        json.dumps(meta, indent=2),
        None,
    )
41
+
42
+
43
+ with gr.Blocks(title="ISCC Generator", css=custom_css) as demo:
44
+ gr.Markdown("## ⚙️ ISCC Generator")
45
+ with gr.Row():
46
+ in_file = gr.File(label="Media File")
47
+ with gr.Row():
48
+ out_iscc = gr.Text(
49
+ label="ISCC",
50
+ info="GENERATED FROM MEDIA FILE",
51
+ show_copy_button=True,
52
+ show_label=True,
53
+ )
54
+ with gr.Row(variant="panel", equal_height=False):
55
+ with gr.Column():
56
+ out_thumbnail = gr.Image(
57
+ label="Extracted Thumbnail",
58
+ elem_classes=["fixed-height"],
59
+ height=240,
60
+ )
61
+ with gr.Column(scale=3):
62
+ with gr.Group():
63
+ out_name = gr.Text(label="Name", show_copy_button=True)
64
+ out_description = gr.Textbox(
65
+ label="Description", lines=4, max_lines=4, show_copy_button=True
66
+ )
67
+
68
+ with gr.Row():
69
+ with gr.Accordion(label="ISCC Metadata", open=False):
70
+ out_meta = gr.Code(language="json", label="JSON-LD")
71
+ in_file.upload(
72
+ generate_iscc,
73
+ inputs=[in_file],
74
+ outputs=[out_iscc, out_thumbnail, out_name, out_description, out_meta, in_file],
75
+ )
76
+
77
+ # Custom footer
78
+ footer = (
79
+ "https://github.com/iscc"
80
+ f" | iscc-core v{ic.__version__}"
81
+ f" | iscc-sdk v{idk.__version__}"
82
+ f" | iscc-sci v{sci.__version__}"
83
+ f" | iscc-schema v{iss.__version__}"
84
+ )
85
+ gr.Markdown(
86
+ footer,
87
+ )
88
+
89
+
90
+ if __name__ == "__main__":
91
+ demo.launch()
demos/images1/pope1.jpg ADDED
demos/images1/pope1b.jpg ADDED
demos/images1/ukbench00000.jpg ADDED
demos/images1/ukbench00016.jpg ADDED
demos/images1/ukbench00044.jpg ADDED
demos/images1/ukbench00052.jpg ADDED
demos/images2/pope2.jpg ADDED
demos/images2/pope2b.jpg ADDED
demos/images2/ukbench00002.jpg ADDED
demos/images2/ukbench00017.jpg ADDED
demos/images2/ukbench00046.jpg ADDED
demos/images2/ukbench00053.jpg ADDED
demos/inspect_.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ from loguru import logger as log
3
+ import gradio as gr
4
+ import iscc_core as ic
5
+
6
+
7
def explain_iscc(code):
    """Break an ISCC down into its different representations.

    Returns a 5-tuple: output column update, canonical form, human readable
    form, decomposed units and base58btc multiformat. For empty input the
    column stays hidden and all values are `None`. If processing fails the
    column stays hidden, the error is logged, and its message is returned in
    the canonical slot.
    """
    hidden = gr.Column(visible=False)
    if not code:
        return hidden, None, None, None, None

    try:
        canonical = ic.iscc_normalize(code)
        # TODO Update iscc-core validation for MSCDI
        # ic.iscc_validate(canonical, strict=True)
        human = ic.iscc_explain(code).replace("-", " - ")
        decomposed = " - ".join(ic.iscc_decompose(canonical))
        multiformat = ic.Code(canonical).mf_base58btc
    except Exception as e:
        log.error(e)
        return hidden, str(e), None, None, None

    return gr.Column(visible=True), canonical, human, decomposed, multiformat
24
+
25
+
26
+ with gr.Blocks() as demo:
27
+ gr.Markdown(
28
+ """
29
+ ## 🕵️‍♂️ ISCC Inspector
30
+ """
31
+ )
32
+ with gr.Row():
33
+ with gr.Column():
34
+ in_iscc = gr.Text(
35
+ label="ISCC Inspector",
36
+ info="DECODE & EXPLAIN ISCC STRUCTURE",
37
+ placeholder="Paste an ISCC here to break it down",
38
+ autofocus=True,
39
+ )
40
+ examples = [
41
+ "ISCC:AAAWN77F727NXSUS", # Meta-Code
42
+ "bzqaqaal5rvp72lx2thvq", # Multiformat
43
+ "ISCC:EAASKDNZNYGUUF5A", # Text-Code
44
+ "ISCC:GABW5LUBVP23N3DOD7PPINHT5JKBI", # Data-Code 128 bits
45
+ "ISCC:KUAG5LUBVP23N3DOHCHWIYGXVN7ZS", # ISCC-SUM
46
+ "ISCC:KAA2Y5NUST7BFD5NN2XIDK7VW3WG4OEPMRQNPK37TE", # ISCC-CDI
47
+ "z36hVxiqoF8AAmDpZV958hn3tsv2i7v1NfCrSzpq", # ISCC-CDI multiformats
48
+ "ISCC:KACT4EBWK27737D2AYCJRAL5Z36G76RFRMO4554RU26HZ4ORJGIVHDI",
49
+ ]
50
+ gr.Examples(label="Example ISCCs", examples=examples, inputs=[in_iscc])
51
+
52
+ with gr.Row():
53
+ with gr.Column(visible=False) as out_column:
54
+ out_canonical = gr.Text(
55
+ label="Canonical",
56
+ info="NORMALIZED STANDARD REPRESENTATION",
57
+ show_copy_button=True,
58
+ value=None,
59
+ )
60
+ out_human = gr.Text(
61
+ label="Human Readable",
62
+ info="MAINTYPE - SUBTYPE - VERSION - LENGTH - BODY",
63
+ show_copy_button=True,
64
+ )
65
+ out_decomposed = gr.Text(
66
+ label="Decomposed",
67
+ info="ISCC-UNITS",
68
+ show_copy_button=True,
69
+ )
70
+ out_multiformat = gr.Text(
71
+ label="Multiformat",
72
+ info="BASE58-BTC",
73
+ show_copy_button=True,
74
+ )
75
+ in_iscc.change(
76
+ explain_iscc,
77
+ inputs=[in_iscc],
78
+ outputs=[
79
+ out_column,
80
+ out_canonical,
81
+ out_human,
82
+ out_decomposed,
83
+ out_multiformat,
84
+ ],
85
+ show_progress="hidden",
86
+ )
87
+
88
+ if __name__ == "__main__":
89
+ demo.launch()
demos/samples/sample.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # What is the ISCC
2
+
3
+ The ISCC is a similarity preserving fingerprint and identifier for digital media assets.
4
+
5
+ ISCCs are generated algorithmically from digital content, just like cryptographic hashes. However, instead of using a single cryptographic hash function to identify data only, the ISCC uses various algorithms to create a composite identifier that exhibits similarity-preserving properties (soft hash).
6
+
7
+ The component-based structure of the ISCC identifies content at multiple levels of abstraction. Each component is self-describing, modular, and can be used separately or with others to aid in various content identification tasks. The algorithmic design supports content deduplication, database synchronization, indexing, integrity verification, timestamping, versioning, data provenance, similarity clustering, anomaly detection, usage tracking, allocation of royalties, fact-checking and general digital asset management use-cases.
poetry.lock CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -8,8 +8,10 @@ readme = "README.md"
8
 
9
  [tool.poetry.dependencies]
10
  python = "^3.9"
11
- gradio = "^4.12.0"
12
- iscc-sdk = "^0.6.0"
 
 
13
 
14
  [tool.poetry.group.dev.dependencies]
15
  black = "^23.12.1"
 
8
 
9
  [tool.poetry.dependencies]
10
  python = "^3.9"
11
+ gradio = "*"
12
+ iscc-sdk = "^0.6.1"
13
+ iscc-sci = "^0.1.0"
14
+ plotly = "^5.18.0"
15
 
16
  [tool.poetry.group.dev.dependencies]
17
  black = "^23.12.1"
requirements.txt CHANGED
@@ -1 +1,4 @@
1
- iscc-sdk==0.6.0
 
 
 
 
1
+ gradio==4.19.1
2
+ iscc-sdk==0.6.1
3
+ iscc-sci==0.1.0
4
+ plotly==5.19.0