File size: 2,592 Bytes
461c45d
 
 
2be75e8
 
 
7117f63
461c45d
8e38fa9
2be75e8
66a3725
2be75e8
 
 
 
 
 
461c45d
 
28f08c2
353f3d1
2be75e8
66a3725
 
 
 
 
2be75e8
 
28f08c2
461c45d
28f08c2
66a3725
 
461c45d
28f08c2
66a3725
 
 
28f08c2
41e4b90
e6e88b9
c6da86c
353f3d1
41e4b90
 
461c45d
34c0fa2
2be75e8
28f08c2
353f3d1
 
 
 
28f08c2
 
 
461c45d
28f08c2
 
 
 
 
 
 
 
 
 
 
41e4b90
28f08c2
41e4b90
28f08c2
41e4b90
 
28f08c2
41e4b90
8e7670b
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import streamlit as st
import json
import pandas as pd
from datasets import load_dataset

# Page-level setup: page title, wide layout, and the sidebar heading.
st.set_page_config(page_title="The Stack data Inspection", layout="wide")
st.sidebar.title("The Stack data Inspection")

# Table with (at least) an "extension" and a "language" column per row.
df = pd.read_csv("new_extension_distribution.csv")
all_extensions = df["extension"].tolist()

# Map each language to the list of its extensions, keeping the CSV row order
# both for the languages (dict insertion order) and within each list.
tags = {}
for language, extension in zip(df["language"], df["extension"]):
    tags.setdefault(language, []).append(extension)
all_languages = list(tags)



@st.cache_data()
def load_data(language, ext):
    """Load the "train" split of the inspection data for one language/extension.

    The split is read from the ``data/{language}/{ext}`` directory of the
    ``loubnabnl/the-stack-inspection-data`` dataset. The function is wrapped
    in ``st.cache_data``.

    Args:
        language: Language name, used as the first path component.
        ext: File extension, used as the second path component.

    Returns:
        The object returned by ``load_dataset`` for that directory and split.
    """
    subset_dir = f"data/{language}/{ext}"
    return load_dataset(
        "loubnabnl/the-stack-inspection-data",
        split="train",
        data_dir=subset_dir,
    )

# Language / extension pickers.
# Both widgets are created through ``st.sidebar``, which places them in the
# sidebar no matter which ``with`` container is active. Wrapping them in
# main-area columns therefore had no effect on their position and only left an
# empty row of columns on the main page, so they are created directly.
chosen_language = st.sidebar.selectbox(
    label="Select a programming language", options=all_languages, index=0
)
# The extension choices depend on the language selected just above.
chosen_ext = st.sidebar.selectbox(
    label="Select an extension", options=tags[chosen_language], index=0
)

# Sidebar filter controls. The values chosen here are applied as thresholds
# when the dataset is filtered further down in the script.
st.sidebar.header("Filters")
not_lexable = st.sidebar.checkbox("Not lexable")
# Slider arguments are (label, min, max, initial value).
min_alphanum = st.sidebar.slider("Minimum alphanumeric fraction", 0.0, 1.0, 1.0)
max_line_length = st.sidebar.slider("Maximum line length", 0, 1000, 0)
max_mean_line_length = st.sidebar.slider("Maximum average line length", 0, 500, 0)
# Adjacent string literals are concatenated; the explicit trailing space keeps
# the two sentences apart (a backslash continuation inside a single literal
# glued them together as "values.`alphanumeric_fraction`").
st.sidebar.markdown(
    "Printed files have `max_line_length`  and `average_line_length` larger than the selected values. "
    "`alphanumeric_fraction` is smaller than the selected value."
)

# load and filter dataset
samples = load_data(chosen_language, chosen_ext)


def _passes_filters(sample):
    """Return True when ``sample`` satisfies every active sidebar filter.

    A sample is kept when its alphanumeric fraction is below the selected
    value, its maximum and average line lengths are above the selected
    values, and, if "Not lexable" is ticked, it is not lexable.
    """
    if not_lexable and sample["lexable"]:
        return False
    return (
        sample["alphanum_fraction"] < min_alphanum
        and sample["max_line_length"] > max_line_length
        and sample["avg_line_length"] > max_mean_line_length
    )


# One combined predicate means a single pass over the dataset instead of one
# pass per criterion; the selected rows are the same.
samples = samples.filter(_passes_filters)

max_docs = len(samples)

# Show one document from the filtered dataset, or a notice when nothing is left.
if max_docs == 0:
    st.text("The dataset is empty after the filtering!")
else:
    # Two equal-width columns; only the first one holds the index picker.
    picker_col, _spacer = st.columns([3, 3])
    with picker_col:
        doc_idx = st.number_input(
            f"Extension {chosen_ext} has {max_docs} files, choose one to visualize:",
            min_value=0,
            max_value=max_docs - 1,
            value=0,
            step=1,
        )

    doc = samples[doc_idx]

    st.markdown("#### File content:")
    if not doc["lexable"]:
        # Non-lexable files are printed as plain text, without highlighting.
        st.text("File can't be lexed so we remove syntax highlighting.\nContent:\n")
        st.text(str(doc["content"]))
    else:
        st.code(doc["content"], language=chosen_language)