import streamlit as st
import pandas as pd
from datasets import load_dataset
st.set_page_config(page_title="The Stack Data Inspection", layout="wide")
st.sidebar.title("The Stack Data Inspection")
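
# Build a mapping from each programming language to its file extensions.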
df = pd.read_csv("new_extension_distribution.csv")
all_extensions = df["extension"].tolist()
tags = {}
for index, row in df.iterrows():
    if row["language"] not in tags:
        tags[row["language"]] = []
    tags[row["language"]].append(row["extension"])
all_languages = list(tags.keys())
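
# Cache each (language, extension) subset so switching selections doesn't reload it.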
@st.cache_data()
def load_data(language, ext):
    ds = load_dataset(
        "loubnabnl/the-stack-inspection-data",
        data_dir=f"data/{language}/{ext}",
        split="train",
    )
    return ds
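
# Sidebar selectors for the language and extension to inspect.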
col1, col2, _ = st.columns([1, 1, 4])
with col1:
    chosen_language = st.sidebar.selectbox(
        label="Select a programming language", options=all_languages, index=0
    )
with col2:
    chosen_ext = st.sidebar.selectbox(
        label="Select an extension", options=tags[chosen_language], index=0
    )
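
# Filters for surfacing outlier files: long lines, low alphanumeric content, or unlexable files.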
st.sidebar.header("Filters")
not_lexable = st.sidebar.checkbox("Not lexable")
min_alphanum = st.sidebar.slider("Minimum alphanumeric fraction", 0.0, 1.0, 1.0)
max_line_length = st.sidebar.slider("Maximum line length", 0, 1000, 0)
max_mean_line_length = st.sidebar.slider("Maximum average line length", 0, 500, 0)
st.sidebar.markdown(
    "Displayed files have `max_line_length` and `avg_line_length` larger than the "
    "selected values, and `alphanum_fraction` smaller than the selected value."
)
# load and filter dataset
samples = load_data(chosen_language, chosen_ext)
samples = samples.filter(lambda x: x["alphanum_fraction"] < min_alphanum)
samples = samples.filter(lambda x: x["max_line_length"] > max_line_length)
samples = samples.filter(lambda x: x["avg_line_length"] > max_mean_line_length)
if not_lexable:
    samples = samples.filter(lambda x: not x["lexable"])
max_docs = len(samples)
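
# Let the user pick one of the remaining files to visualize.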
if max_docs > 0:
    col_1, _ = st.columns([3, 3])
    with col_1:
        index_example = st.number_input(
            f"Extension {chosen_ext} has {max_docs} files, choose one to visualize:",
            min_value=0,
            max_value=max_docs - 1,
            value=0,
            step=1,
        )
    example = samples[index_example]
    st.markdown("#### File content:")
    if example["lexable"]:
        st.code(example["content"], language=chosen_language)
    else:
        # Fall back to plain text when the file can't be lexed.
        st.text("File can't be lexed, so syntax highlighting is disabled.\nContent:\n")
        st.text(str(example["content"]))
else:
    st.text("The dataset is empty after filtering!")