# the-stack-inspection
#
# Running
#
# File size: 3,271 Bytes

import streamlit as st
import json
import pandas as pd
from datasets import load_dataset

# Page chrome: wide layout and a title at the top of the sidebar.
st.set_page_config(page_title="The Stack data Inspection", layout="wide")
st.sidebar.title("The Stack data Inspection")

# The CSV must provide at least the "language" and "extension" columns.
df = pd.read_csv("extension_distribution.csv")
all_extensions = df["extension"].tolist()

# Group extensions by language, keeping the CSV row order for both the
# languages (dict insertion order) and the extensions within each language.
tags = {}
for language, extension in zip(df["language"], df["extension"]):
    tags.setdefault(language, []).append(extension)
all_languages = list(tags)



@st.cache()
def load_data(language, ext):
    """Load the ``train`` split of the inspection data for one language/extension.

    Args:
        language: Name of the language directory under ``data/`` in the
            ``loubnabnl/the-stack-inspection-data`` dataset repository.
        ext: Name of the extension directory under ``data/<language>/``.

    Returns:
        The object returned by ``load_dataset`` for ``split="train"``.
    """
    # NOTE(review): ``st.cache`` is deprecated in recent Streamlit releases in
    # favour of ``st.cache_data`` / ``st.cache_resource`` -- confirm against the
    # Streamlit version this app is pinned to before switching.
    subset_dir = f"data/{language}/{ext}"
    return load_dataset(
        "loubnabnl/the-stack-inspection-data",
        split="train",
        data_dir=subset_dir,
    )


# Sidebar selectors. Both widgets are created through ``st.sidebar`` directly,
# so they always render in the sidebar; wrapping them in main-area column
# containers only produced an empty row of columns and has been dropped.
chosen_language = st.sidebar.selectbox(
    label="Select a programming language", options=all_languages, index=0
)
# The extension choices depend on the language selected just above.
chosen_ext = st.sidebar.selectbox(
    label="Select an extension", options=tags[chosen_language], index=0
)

# Optional filters; each checkbox narrows the dataset on the matching
# boolean field of a sample when ticked.
st.sidebar.header("Filters")
not_lexable = st.sidebar.checkbox("Not lexable?")
low_alphanum = st.sidebar.checkbox("Low alphanum count?")
long_lines = st.sidebar.checkbox("Long lines?")


# Fetch the dataset for the current language/extension selection, then narrow
# it down with whichever sidebar filters are ticked.
samples = load_data(chosen_language, chosen_ext)

# (checkbox state, row predicate) pairs, applied in this order.
sidebar_filters = (
    (not_lexable, lambda x: not x["lexable"]),
    (low_alphanum, lambda x: x["low_alphanum"]),
    (long_lines, lambda x: x["long_lines"]),
)
for enabled, keep in sidebar_filters:
    if enabled:
        samples = samples.filter(keep)

# Number of files left after filtering; each remaining row is also tagged
# with its position in the filtered dataset.
max_docs = len(samples)
samples = samples.add_column("idx", range(max_docs))

# info about extension
# st.sidebar.markdown("### Information about the extension:")
# text = f"Extension {chosen_ext} has {max_docs} files, {df[df['extension'] == chosen_ext]['low_alphanum_count'].values[0]} with very low alphanumeric ratio, \
# {df[df['extension'] == chosen_ext]['long_lines_count'].values[0]} with very long lines, and {df[df['extension'] == chosen_ext]['non_lexable_count'].values[0]} \
# are not lexable.\n These files are at indexes:\n {indexes_not_lexed}."
# st.sidebar.markdown(text)

if max_docs > 0:
    # Half-width column keeps the index picker from spanning the whole page.
    col_1, _ = st.columns([3, 3])
    with col_1:
        # All numeric arguments are ints, so the widget yields an int that can
        # be used directly as a row index.
        index_example = st.number_input(
            f"Extension {chosen_ext} has {max_docs} files, choose one to visualize:",
            min_value=0,
            max_value=max_docs - 1,
            value=0,
            step=1,
        )

    # Row selected by the user.
    example = samples[index_example]

    # Display the file content, with syntax highlighting only when the file
    # is flagged as lexable.
    st.markdown("#### File content:")
    if example["lexable"]:
        st.code(example["content"], language=chosen_language)
    else:
        st.write("File can't be lexed so we remove syntax highlighting.\nContent:\n")
        st.text(example["content"])
else:
    # Nothing left to show (e.g. the ticked filters exclude every file):
    # tell the user instead of leaving the page blank.
    st.warning(
        f"No files to display for extension {chosen_ext} with the current filters."
    )