import json

import pandas as pd
import streamlit as st
from datasets import load_dataset

st.set_page_config(page_title="The Stack data Inspection", layout="wide")
st.sidebar.title("The Stack data Inspection")

# Per-extension statistics. The columns read in this script are "language",
# "extension", "low_alphanum_count", "long_lines_count" and "non_lexable_count".
df = pd.read_csv("extension_distribution.csv")
all_extensions = df["extension"].tolist()

# Map each language to the list of its extensions, in CSV row order.
tags = {}
for language, extension in zip(df["language"], df["extension"]):
    tags.setdefault(language, []).append(extension)
all_languages = list(tags.keys())


@st.cache()
def load_data(language, ext):
    """Load the "train" split of the inspection dataset for one extension.

    Args:
        language: Language name, used as the first directory level under
            ``data/`` in the dataset repository.
        ext: Extension name, used as the second directory level.

    Returns:
        The dataset returned by ``load_dataset`` for
        ``data/{language}/{ext}``.
    """
    ds = load_dataset(
        "loubnabnl/the-stack-inspection-data",
        data_dir=f"data/{language}/{ext}",
        split="train",
    )
    return ds


col1, col2, _ = st.sidebar.columns([1, 1, 4])
with col1:
    chosen_language = st.selectbox(
        label="Select a programming language", options=all_languages, index=0
    )
with col2:
    chosen_ext = st.selectbox(
        label="Select an extension", options=tags[chosen_language], index=0
    )

# load the dataset and get indexes of non lexable files
samples = load_data(chosen_language, chosen_ext)
max_docs = len(samples)
# Attach each row's position so it survives the filter below.
samples = samples.add_column("idx", list(range(max_docs)))
not_lexed = samples.filter(lambda x: not x["lexable"])
indexes_not_lexed = not_lexed["idx"]

# info about extension
st.sidebar.markdown("### Information about the extension:")
# Match on language as well as extension: chosen_ext comes from
# tags[chosen_language], so this pair always exists in df, and an extension
# listed under several languages no longer reports another language's counts.
ext_rows = df[(df["language"] == chosen_language) & (df["extension"] == chosen_ext)]
low_alphanum_count = ext_rows["low_alphanum_count"].values[0]
long_lines_count = ext_rows["long_lines_count"].values[0]
non_lexable_count = ext_rows["non_lexable_count"].values[0]
text = (
    f"Extension {chosen_ext} has {max_docs} files, "
    f"{low_alphanum_count} with very low alphanumeric ratio, "
    f"{long_lines_count} with very long lines, "
    f"and {non_lexable_count} are not lexable.\n"
    f" These files are at indexes:\n {indexes_not_lexed}."
)
# Show the per-extension summary built above in the sidebar.
st.sidebar.markdown(text)

col_1, _ = st.columns([2, 4])
with col_1:
    index_example = st.number_input(
        f"Extension {chosen_ext} has {max_docs} files, choose one to visualize:",
        min_value=0,
        max_value=max_docs - 1,
        value=0,
        step=1,
    )

# info about the chosen example
example = samples[index_example]
st.markdown("#### Information about the chosen example:")
# Each phrase is derived from the flag it describes: "low_alphanum" feeds the
# alphanumeric-ratio wording and "long_lines" feeds the long-lines wording.
text_alpha = "**has**" if example["low_alphanum"] else "doesn't have"
text_lines = "**has**" if example["long_lines"] else "doesn't have"
text_lexer = "is" if example["lexable"] else "**isn't**"
st.markdown(
    f"Example {index_example} {text_alpha} a very low alphanumeric ratio, "
    f"{text_lines} very long lines, and {text_lexer} lexable."
)

# display file content
st.markdown("#### File content:")
if not example["lexable"]:
    st.write("File can't be lexed so we remove syntax highlighting.\nContent:")
    # st.write renders strings as Markdown, which would reinterpret the raw
    # file; st.text shows it verbatim with whitespace preserved.
    st.text(example["content"])
else:
    st.code(example["content"], language=chosen_language)