Szymon Woźniak committed on
Commit
cd37af8
1 Parent(s): abc36fe

add dataset statistics

Browse files
data/datasets_stats.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8392af92da4f335b3c7319c662e8416cf2202621c67672bb3d09644192226dff
3
+ size 37122
pages/3_Dataset_Statistics.py CHANGED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from filter_dataframe import filter_dataframe
4
+
5
+
6
@st.cache_data
def get_language_stats_df():
    """Read the dataset statistics table from the bundled parquet file.

    The function is decorated with ``st.cache_data``, so Streamlit can reuse
    the loaded frame across script reruns instead of re-reading the file.

    Returns:
        The contents of ``data/datasets_stats.parquet`` as a DataFrame.
    """
    # Relative path: resolved against the working directory the app runs from.
    stats_path = "data/datasets_stats.parquet"
    stats_df = pd.read_parquet(stats_path)
    return stats_df
9
+
10
# Header placed at the top of every export; currently only a placeholder.
_MMS_CITATION = """\
#TODO: Add MMS citation
"""

# Divider written between citation entries. It starts with "%", presumably so
# the line reads as a comment when the export is pasted into a BibTeX file.
CITATION_SEPARATOR = "\n% " + ("-" * 90) + "\n"


def export_citations(df: pd.DataFrame) -> str:
    """Build a citation export for the datasets present in *df*.

    Rows are grouped by citation text, so datasets that share one citation are
    listed together on a single ``% Datasets: ...`` line above that citation.
    Entries are ordered by their dataset list (ties broken by citation text)
    and joined with ``CITATION_SEPARATOR``; the whole export is prefixed with
    ``_MMS_CITATION``.

    Args:
        df: Frame with an ``original_dataset`` column (dataset name) and a
            ``citation`` column (citation text). Other columns are ignored.

    Returns:
        The export text. For a frame with no usable rows this is just the
        ``_MMS_CITATION`` header followed by two newlines.

    Raises:
        KeyError: If ``original_dataset`` or ``citation`` is missing.
    """
    # Rows lacking either value cannot produce a citation entry, and a frame
    # with several rows per dataset must still name each dataset only once.
    pairs = df[["original_dataset", "citation"]].dropna().drop_duplicates()

    # Plain dict grouping keeps first-seen dataset order within each citation
    # and behaves the same for an empty selection.
    datasets_by_citation = {}
    for dataset, citation in zip(pairs["original_dataset"], pairs["citation"]):
        datasets_by_citation.setdefault(citation, []).append(str(dataset))

    entries = sorted(
        (", ".join(names), citation)
        for citation, names in datasets_by_citation.items()
    )
    formatted = [
        f"% Datasets: {names}\n{citation}" for names, citation in entries
    ]
    return f"{_MMS_CITATION}\n\n{CITATION_SEPARATOR.join(formatted)}"
28
+
29
+
30
# Page layout for the "Dataset statistics" Streamlit page.
st.set_page_config(page_title="Dataset statistics", page_icon="📈")

st.markdown("# Dataset statistics")
st.write(
    """TODO: Description"""
)

df = get_language_stats_df()

# filter_dataframe comes from the project's filter_dataframe module; it is
# presumably what renders the filter widgets and returns the narrowed frame.
df_filter = filter_dataframe(df)
st.dataframe(df_filter)


if st.button("Export citations"):
    # Build the export once and show it in the page only; the earlier debug
    # prints to stdout and the duplicate export call were removed.
    st.code(export_citations(df_filter))