the-stack-inspection

Running

App Files Files Community

the-stack-inspection / app.py

lvwerra HF staff

Update app.py

98c3786 10 months ago

raw

history blame contribute delete

No virus

2.91 kB

	import streamlit as st
	import json
	import pandas as pd
	from datasets import load_dataset

	# Page chrome: wide layout, with the app title shown in the sidebar.
	st.set_page_config(page_title="The Stack data Inspection", layout="wide")
	st.sidebar.title("The Stack data Inspection")

	# The CSV provides (at least) a "language" and an "extension" column.
	df = pd.read_csv("new_extension_distribution.csv")
	all_extensions = df["extension"].tolist()

	# Group extensions by language, keeping languages in first-seen order.
	# Extensions are stored as strings, so a missing value becomes "nan".
	tags = {}
	for language, extension in zip(df["language"], df["extension"]):
	    tags.setdefault(language, []).append(str(extension))
	all_languages = list(tags)


	@st.cache(max_entries=100)
	def load_data(language, ext, min_alphanum, max_line_length, max_mean_line_length, non_lexable):
	    """Load the samples for one language/extension pair and filter them.

	    Despite the parameter names, the numeric arguments act as thresholds in
	    the opposite direction: a sample is kept only if its
	    ``alphanum_fraction`` is strictly below ``min_alphanum`` and both its
	    ``max_line_length`` and ``avg_line_length`` are strictly above
	    ``max_line_length`` and ``max_mean_line_length`` respectively.

	    Args:
	        language: Name of the language sub-directory inside the dataset.
	        ext: Extension sub-directory. The string ``"nan"`` is replaced by
	            ``None``, so the data directory becomes ``data/<language>/None``.
	        min_alphanum: Exclusive upper bound on ``alphanum_fraction``.
	        max_line_length: Exclusive lower bound on ``max_line_length``.
	        max_mean_line_length: Exclusive lower bound on ``avg_line_length``.
	        non_lexable: When truthy, additionally keep only samples whose
	            ``lexable`` field is falsy.

	    Returns:
	        The filtered ``train`` split returned by ``load_dataset``.
	    """
	    if ext == "nan":
	        ext = None
	    data_dir = f"data/{language}/{ext}"

	    def _passes_thresholds(sample):
	        return (
	            sample["alphanum_fraction"] < min_alphanum
	            and sample["max_line_length"] > max_line_length
	            and sample["avg_line_length"] > max_mean_line_length
	        )

	    def _is_not_lexable(sample):
	        return not sample["lexable"]

	    dataset = load_dataset(
	        "loubnabnl/the-stack-inspection-data",
	        data_dir=data_dir,
	        split="train",
	    )
	    dataset = dataset.filter(_passes_thresholds)
	    if non_lexable:
	        dataset = dataset.filter(_is_not_lexable)
	    return dataset

	# Language / extension pickers. The column containers are entered, but both
	# select boxes are created through `st.sidebar`.
	language_col, extension_col, _ = st.columns([1, 1, 4])
	with language_col:
	    chosen_language = st.sidebar.selectbox(
	        "Select a programming language",
	        options=all_languages,
	        index=0,
	    )
	with extension_col:
	    # The extension choices depend on the language picked above.
	    extension_choices = tags[chosen_language]
	    chosen_ext = st.sidebar.selectbox(
	        "Select an extension",
	        options=extension_choices,
	        index=0,
	    )

	# Filter widgets; their values are passed straight to `load_data`.
	st.sidebar.header("Filters")
	not_lexable = st.sidebar.checkbox("Not lexable")
	min_alphanum = st.sidebar.slider(
	    "Minimum alphanumeric fraction", min_value=0.0, max_value=1.0, value=1.0
	)
	max_line_length = st.sidebar.slider(
	    "Maximum line length", min_value=0, max_value=1200, value=0, step=100
	)
	max_mean_line_length = st.sidebar.slider(
	    "Maximum average line length", min_value=0, max_value=500, value=0, step=100
	)
	st.sidebar.markdown("Printed files have `max_line_length` and `average_line_length` larger than the selected values.\
	`alphanumeric_fraction` is smaller than the selected value.")

	samples = load_data(
	    chosen_language,
	    chosen_ext,
	    min_alphanum,
	    max_line_length,
	    max_mean_line_length,
	    not_lexable,
	)

	# Number of samples left after filtering; it also bounds the index picker.
	max_docs = len(samples)

	if max_docs > 0:
	    # Let the user pick which of the remaining samples to display.
	    col_1, _ = st.columns([3, 3])
	    with col_1:
	        index_example = st.number_input(
	            f"Extension {chosen_ext} has {max_docs} files, choose one to visualize:",
	            min_value=0,
	            max_value=max_docs - 1,
	            value=0,
	            step=1,
	        )

	    example = samples[index_example]

	    st.markdown("#### File content:")
	    content = str(example["content"])

	    if len(content) > 10_000:
	        # Truncate the stringified copy (not the raw field) so the length
	        # check, the slice and the concatenation all act on the same str.
	        content = content[:10_000] + "\n[MORE CODE, DISPLAYING FIRST 10k CHARACTERS]"

	    if example["lexable"]:
	        st.code(content, language=chosen_language)
	    else:
	        # Samples flagged as not lexable are shown as plain text.
	        st.text("File can't be lexed so we remove syntax highlighting.\nContent:\n")
	        st.text(content)
	else:
	    st.text("The dataset is empty after the filtering!")