Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 1,957 Bytes
53ffb10 4a852e1 53ffb10 963c572 53ffb10 963c572 53ffb10 963c572 53ffb10 963c572 53ffb10 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
import html
import json

import streamlit as st
from datasets import load_from_disk
class Visualization:
    """Streamlit page that shows one document of an on-disk web-document dataset.

    A document is read as three parallel sequences: ``texts``, ``images`` and
    a JSON-encoded ``metadata`` list. For each position, the text is shown if
    it is non-empty; otherwise, if the image entry is non-empty, the image is
    shown using the ``src`` URL found in the matching metadata entry.
    """

    def __init__(self, path_web_documents_dataset):
        """Remember where the dataset lives; nothing is loaded yet.

        Args:
            path_web_documents_dataset: Path later passed to
                ``datasets.load_from_disk``.
        """
        self.path_web_documents_dataset = path_web_documents_dataset

    def visualization(self):
        """Render the full page: title, dataset load, document picker, document."""
        self.set_title()
        self.load_dataset()
        self.choose_document()
        self.display_document()

    def set_title(self):
        """Write the page title."""
        st.title("Visualization of OBELICS web documents")

    def load_dataset(self):
        """Load the dataset from disk into ``self.dataset``."""
        self.dataset = load_from_disk(self.path_web_documents_dataset)

    def choose_document(self):
        """Let the user pick a row index and store that row in ``self.current_doc``."""
        st.header("Choose a document")
        last_idx = self.dataset.num_rows - 1
        idx = st.number_input(
            f"Select a document among the first {self.dataset.num_rows} ones",
            min_value=0,
            max_value=last_idx,
            value=0,
            step=1,
            help=f"Index between 0 and {last_idx}",
        )
        self.current_doc = self.dataset[idx]

    @staticmethod
    def _text_to_html(text):
        """Return ``text`` as an HTML ``<pre>`` block that is safe to render raw.

        The text comes from crawled web pages, so ``&``, ``<`` and ``>`` are
        escaped before the string is handed to the browser; otherwise a stray
        tag in a document would be interpreted as markup.
        """
        escaped = html.escape(f"{text}\n", quote=False)
        # Line breaks become <br>. Spaces are deliberately NOT converted to
        # non-breaking spaces: that preserves white space but pushes text
        # outside the width of the window.
        return "<pre>" + escaped.replace("\n", "<br>") + "</pre>"

    @staticmethod
    def _image_to_html(src):
        """Return an ``<img>`` tag for ``src`` with the URL attribute-escaped.

        Quotes are escaped as well, so a URL containing ``"`` cannot close the
        ``src`` attribute and smuggle in extra attributes.
        """
        safe_src = html.escape(str(src), quote=True)
        return f'<img src="{safe_src}" style="max-width: 1000px; height: auto;" />'

    def display_document(self):
        """Render the text and image nodes of ``self.current_doc`` in order."""
        st.header("Document")
        texts = self.current_doc["texts"]
        images = self.current_doc["images"]
        # The metadata column is stored as a JSON string; once decoded it is
        # iterated in lockstep with the texts and images.
        metadata = json.loads(self.current_doc["metadata"])
        for text, image, meta in zip(texts, images, metadata):
            if text:
                st.markdown(self._text_to_html(text), unsafe_allow_html=True)
            elif image:
                st.markdown(self._image_to_html(meta["src"]), unsafe_allow_html=True)
                # NOTE(review): spacer assumed to belong to the image branch
                # (the text branch already ends with its own line break);
                # confirm against the original indentation.
                st.text("\n")
if __name__ == "__main__":
    # Script entry point: configure the page with the wide layout, then build
    # the viewer for the bundled dataset directory and render it.
    st.set_page_config(layout="wide")
    app = Visualization(path_web_documents_dataset="./web_docs_final_replaceimgbyurl")
    app.visualization()
|