File size: 3,271 Bytes
461c45d
 
 
2be75e8
 
 
7117f63
461c45d
2be75e8
 
66a3725
2be75e8
 
 
 
 
 
461c45d
 
a383930
461c45d
2be75e8
66a3725
 
 
 
 
2be75e8
 
66a3725
a383930
461c45d
a383930
66a3725
 
461c45d
a383930
66a3725
 
 
a383930
 
 
 
 
461c45d
66a3725
2be75e8
a383930
 
 
 
 
 
 
 
461c45d
2be75e8
461c45d
2be75e8
a383930
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232b43d
a383930
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import streamlit as st
import json
import pandas as pd
from datasets import load_dataset

# Page chrome: wide layout, with the app title displayed in the sidebar.
st.set_page_config(layout="wide", page_title="The Stack data Inspection")
st.sidebar.title("The Stack data Inspection")

# The CSV must provide at least the "extension" and "language" columns,
# which are the only ones read below.
df = pd.read_csv("extension_distribution.csv")
all_extensions = df["extension"].tolist()

# Group extensions by language, keeping the row order of the CSV both for the
# languages (dict insertion order) and for the extensions of each language.
tags = {}
for language, extension in zip(df["language"], df["extension"]):
    tags.setdefault(language, []).append(extension)
all_languages = list(tags)



@st.cache()
def load_data(language, ext):
    """Load the inspection samples for one language/extension pair.

    Reads the "train" split of the "loubnabnl/the-stack-inspection-data"
    dataset, restricted to the ``data/{language}/{ext}`` sub-directory.
    The call is wrapped in ``st.cache()``, so Streamlit memoizes the result
    per ``(language, ext)`` argument pair.

    NOTE(review): recent Streamlit releases deprecate ``st.cache`` in favour
    of ``st.cache_data`` / ``st.cache_resource`` -- confirm against the
    Streamlit version this app is pinned to before switching.

    Args:
        language: Language directory name inside the dataset repository.
        ext: Extension directory name under that language.

    Returns:
        Whatever ``load_dataset`` returns for the requested split.
    """
    return load_dataset(
        "loubnabnl/the-stack-inspection-data",
        split="train",
        data_dir=f"data/{language}/{ext}",
    )


# Language and extension pickers. The extension choices depend on the
# language currently selected.
# NOTE(review): both select boxes are created through st.sidebar, so the
# surrounding column contexts do not appear to influence where they are
# rendered -- confirm, and consider dropping the columns.
language_col, extension_col, _ = st.columns([1, 1, 4])
with language_col:
    chosen_language = st.sidebar.selectbox(
        label="Select a programming language",
        options=all_languages,
        index=0,
    )
with extension_col:
    chosen_ext = st.sidebar.selectbox(
        label="Select an extension",
        options=tags[chosen_language],
        index=0,
    )

# Optional filters; each checkbox narrows the displayed samples when ticked.
st.sidebar.header("Filters")
not_lexable = st.sidebar.checkbox(label="Not lexable?")
low_alphanum = st.sidebar.checkbox(label="Low alphanum count?")
long_lines = st.sidebar.checkbox(label="Long lines?")


# Fetch the samples for the current language/extension selection.
samples = load_data(chosen_language, chosen_ext)

# Each entry pairs a checkbox state with the row predicate it enables. The
# enabled predicates are applied one after another, so ticking several boxes
# keeps only the rows that satisfy all of them.
sidebar_filters = (
    (not_lexable, lambda x: not x["lexable"]),
    (low_alphanum, lambda x: x["low_alphanum"]),
    (long_lines, lambda x: x["long_lines"]),
)
for enabled, keep_row in sidebar_filters:
    if enabled:
        samples = samples.filter(keep_row)

# Number of rows left after filtering, plus a positional "idx" column
# (0 .. max_docs - 1) over the filtered rows.
max_docs = len(samples)
samples = samples.add_column("idx", range(max_docs))

# info about extension
# st.sidebar.markdown("### Information about the extension:")
# text = f"Extension {chosen_ext} has {max_docs} files, {df[df['extension'] == chosen_ext]['low_alphanum_count'].values[0]} with very low alphanumeric ratio, \
# {df[df['extension'] == chosen_ext]['long_lines_count'].values[0]} with very long lines, and {df[df['extension'] == chosen_ext]['non_lexable_count'].values[0]} \
# are not lexable.\n These files are at indexes:\n {indexes_not_lexed}."
# st.sidebar.markdown(text)

# Nothing is rendered below when the filters leave no sample.
if max_docs > 0:
    # Two equal-width columns; only the first hosts the picker, the second
    # is never used.
    picker_col, _ = st.columns([3, 3])
    with picker_col:
        index_example = st.number_input(
            f"Extension {chosen_ext} has {max_docs} files, choose one to visualize:",
            value=0,
            step=1,
            min_value=0,
            max_value=max_docs - 1,
        )

    # Row of the filtered dataset picked by the user.
    example = samples[index_example]

    # Disabled per-example summary, kept for reference.
    # NOTE(review): as written, text_alpha is derived from "long_lines" and
    # text_lines from "low_alphanum", i.e. swapped relative to the sentence
    # they are interpolated into -- fix before re-enabling.
    # st.markdown("#### Information about the chosen example:")
    # text_alpha = "**has**" if example["long_lines"] else "doesn't have"
    # text_lines = "**has**" if example["low_alphanum"] else "doesn't have"
    # text_lexer = "is" if example["lexable"] else "**isn't**"

    # st.markdown(
    #     f"Example {index_example} {text_alpha} a very low alphanumeric ratio, \
    #     {text_lines} very long lines,  and {text_lexer} lexable."
    # )

    # Show the file: syntax-highlighted when it is lexable, otherwise as
    # plain text preceded by a short notice.
    st.markdown("#### File content:")
    if example["lexable"]:
        st.code(example["content"], language=chosen_language)
    else:
        st.write(f"File can't be lexed so we remove syntax highlighting.\nContent:\n")
        st.text(example["content"])