import streamlit as st
import json
import pandas as pd
from datasets import load_dataset
st.set_page_config(page_title="The Stack data Inspection", layout="wide")
st.sidebar.title("The Stack data Inspection")
df = pd.read_csv("extension_distribution.csv")
all_extensions = df["extension"].tolist()
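# map each programming language to the list of extensions that belong to it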
tags = {}
for index, row in df.iterrows():
    if row["language"] not in tags:
        tags[row["language"]] = []
    tags[row["language"]].append(row["extension"])
all_languages = list(tags.keys())
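# cache the dataset load so changing a widget doesn't re-download the split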
@st.cache()
def load_data(language, ext):
    ds = load_dataset(
        "loubnabnl/the-stack-inspection-data",
        data_dir=f"data/{language}/{ext}",
        split="train",
    )
    return ds
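# layout columns for the selection widgets (the selectboxes themselves render in the sidebar)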
col1, col2, _ = st.columns([1, 1, 4])
with col1:
    chosen_language = st.sidebar.selectbox(
        label="Select a programming language", options=all_languages, index=0
    )
with col2:
    chosen_ext = st.sidebar.selectbox(
        label="Select an extension", options=tags[chosen_language], index=0
    )
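# checkboxes to restrict the view to files flagged during inspection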
st.sidebar.header("Filters")
not_lexable = st.sidebar.checkbox("Not lexable?")
low_alphanum = st.sidebar.checkbox("Low alphanum count?")
long_lines = st.sidebar.checkbox("Long lines?")
# load the dataset for the chosen language/extension and apply the selected filters
samples = load_data(chosen_language, chosen_ext)
if not_lexable:
    samples = samples.filter(lambda x: not x["lexable"])
if low_alphanum:
    samples = samples.filter(lambda x: x["low_alphanum"])
if long_lines:
    samples = samples.filter(lambda x: x["long_lines"])
max_docs = len(samples)
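# attach an explicit index column to the (possibly filtered) samples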
samples = samples.add_column("idx", range(len(samples)))
# info about extension
# st.sidebar.markdown("### Information about the extension:")
# text = f"Extension {chosen_ext} has {max_docs} files, {df[df['extension'] == chosen_ext]['low_alphanum_count'].values[0]} with very low alphanumeric ratio, \
# {df[df['extension'] == chosen_ext]['long_lines_count'].values[0]} with very long lines, and {df[df['extension'] == chosen_ext]['non_lexable_count'].values[0]} \
# are not lexable.\n These files are at indexes:\n {indexes_not_lexed}."
# st.sidebar.markdown(text)
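# let the user pick a file and display its content, with syntax highlighting when the file is lexable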
if max_docs > 0:
    col_1, _ = st.columns([3, 3])
    with col_1:
        index_example = st.number_input(
            f"Extension {chosen_ext} has {max_docs} files, choose one to visualize:",
            min_value=0,
            max_value=max_docs - 1,
            value=0,
            step=1,
        )

    # info about the chosen example
    example = samples[index_example]
    # st.markdown("#### Information about the chosen example:")
    # text_alpha = "**has**" if example["low_alphanum"] else "doesn't have"
    # text_lines = "**has**" if example["long_lines"] else "doesn't have"
    # text_lexer = "is" if example["lexable"] else "**isn't**"
    # st.markdown(
    #     f"Example {index_example} {text_alpha} a very low alphanumeric ratio, \
    #     {text_lines} very long lines, and {text_lexer} lexable."
    # )

    # display file content
    st.markdown("#### File content:")
    if not example["lexable"]:
        st.write("File can't be lexed, so syntax highlighting is disabled.\nContent:\n")
        st.text(example["content"])
    else:
        st.code(example["content"], language=chosen_language)