Rahkakavee Baskaran committed on
Commit
ea3bd45
1 Parent(s): 8a6a919

add dataset sizes to treemap and description

Browse files
Files changed (3) hide show
  1. app.py +95 -45
  2. poetry.lock +0 -0
  3. pyproject.toml +25 -0
app.py CHANGED
@@ -1,21 +1,16 @@
1
  from collections import Counter
 
2
  import streamlit as st
3
  import json
4
- from itertools import islice
5
- from typing import Generator
6
  from plotly import express as px
7
  from safetensors import safe_open
8
  from semantic_search import predict
9
  from sentence_transformers import SentenceTransformer
10
  import os
 
11
 
12
  HF_TOKEN = os.environ.get("HF_TOKEN")
13
 
14
- def chunks(data: dict, size=13) -> Generator:
15
- it = iter(data)
16
- for i in range(0, len(data), size):
17
- yield {k: data[k] for k in islice(it, size)}
18
-
19
 
20
  def get_tree_map_data(
21
  data: dict,
@@ -28,27 +23,72 @@ def get_tree_map_data(
28
  values: list = ["0"]
29
 
30
  for group, labels in data.items():
31
- names.append(group)
32
  parents.append(root)
33
  if group in countings_parents:
34
  values.append(str(countings_parents[group]))
 
 
 
 
 
 
 
 
35
  else:
36
  values.append("0")
 
 
37
  for label in labels:
38
  if "-" in label:
39
  label = label.split("-")
40
  label = label[0] + "<br> -" + label[1]
41
- names.append(label)
42
- parents.append(group)
43
  if label in countings_labels:
 
 
 
 
 
 
 
 
 
 
 
44
  values.append(str(countings_labels[label]))
45
- else:
46
- values.append("0")
47
- # if "-" in label:
48
- # names.append(label.split("-")[0])
49
- # parents.append(label)
50
- # names.append(label.split("-")[1])
51
- # parents.append(label)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  return parents, names, values
53
 
54
 
@@ -76,24 +116,28 @@ for el in taxonomy:
76
  else:
77
  taxonomy_group_label_mapper[el["group"]].append("Sonstiges ")
78
 
79
- parents, name, values = get_tree_map_data(
 
 
80
  data=taxonomy_group_label_mapper,
81
  countings_parents=theme_counts,
82
  countings_labels=labels_counts,
83
  root="Musterdatenkatalog",
84
  )
85
 
86
- fig = px.treemap(
87
- names=name,
88
- parents=parents,
89
- )
90
 
91
- fig.update_layout(
92
- margin=dict(t=50, l=25, r=25, b=25),
93
- height=1000,
94
- width=1000,
95
- template="plotly",
 
96
  )
 
 
97
 
98
 
99
  tensors = {}
@@ -112,6 +156,29 @@ st.set_page_config(layout="wide")
112
 
113
  st.title("Musterdatenkatalog")
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  col1, col2, col3 = st.columns(3)
116
  col1.metric("Datensätze", len(data))
117
  col2.metric("Themen", len(theme_counts))
@@ -123,23 +190,6 @@ st.plotly_chart(fig)
123
 
124
  st.title("Predict a Dataset")
125
 
126
- # create two columns and make left column wider
127
-
128
- # st.markdown(
129
- # """
130
- # <style>
131
- # div[data-testid="stVerticalBlock"] div[style*="flex-direction: column;"] div[data-testid="stVerticalBlock"] {
132
- # border-radius: 15px;
133
- # background-color: white;
134
- # box-shadow: 0 0 10px #eee;
135
- # border: 1px solid #ddd;
136
- # padding: 1rem;;
137
- # }
138
- # </style>
139
- # """,
140
- # unsafe_allow_html=True,
141
- # )
142
-
143
  st.markdown(
144
  """
145
  <style>
@@ -170,7 +220,7 @@ col1, col2 = st.columns([1.2, 1])
170
 
171
 
172
  with col2:
173
- st.subheader("Example Datasets")
174
  examples = [
175
  "Spielplätze",
176
  "Berliner Weihnachtsmärkte 2022",
 
1
  from collections import Counter
2
+ import pandas as pd
3
  import streamlit as st
4
  import json
 
 
5
  from plotly import express as px
6
  from safetensors import safe_open
7
  from semantic_search import predict
8
  from sentence_transformers import SentenceTransformer
9
  import os
10
+ import plotly.graph_objects as go
11
 
12
  HF_TOKEN = os.environ.get("HF_TOKEN")
13
 
 
 
 
 
 
14
 
15
  def get_tree_map_data(
16
  data: dict,
 
23
  values: list = ["0"]
24
 
25
  for group, labels in data.items():
 
26
  parents.append(root)
27
  if group in countings_parents:
28
  values.append(str(countings_parents[group]))
29
+ group_name_with_count = (
30
+ group
31
+ + "<br>"
32
+ + "Anzahl Datensätze:"
33
+ + " "
34
+ + str(countings_parents[group])
35
+ )
36
+ names.append(group_name_with_count)
37
  else:
38
  values.append("0")
39
+ group_name_with_count = group + "<br>" + "Anzahl Datensätze:" + " " + "0"
40
+ names.append(group_name_with_count)
41
  for label in labels:
42
  if "-" in label:
43
  label = label.split("-")
44
  label = label[0] + "<br> -" + label[1]
 
 
45
  if label in countings_labels:
46
+ label_name_with_count = (
47
+ label
48
+ + "<br>"
49
+ + "<br>"
50
+ + "Anzahl Datensätze:"
51
+ + "<br>"
52
+ + ""
53
+ + str(countings_labels[label])
54
+ )
55
+ names.append(label_name_with_count)
56
+ parents.append(group_name_with_count)
57
  values.append(str(countings_labels[label]))
58
+ if label not in countings_labels:
59
+ if "<br>" in label:
60
+ if (
61
+ label.split("<br>")[0].strip() + label.split("<br>")[-1]
62
+ in countings_labels
63
+ ):
64
+ label_name_with_count = (
65
+ label
66
+ + "<br>"
67
+ + "<br>"
68
+ + "Anzahl Datensätze:"
69
+ + "<br>"
70
+ + ""
71
+ + str(
72
+ countings_labels[
73
+ label.split("<br>")[0].strip()
74
+ + label.split("<br>")[-1]
75
+ ]
76
+ )
77
+ )
78
+ else:
79
+ print(label)
80
+ label_name_with_count = (
81
+ label
82
+ + "<br>"
83
+ + "<br>"
84
+ + "Anzahl Datensätze:"
85
+ + "<br>"
86
+ + ""
87
+ + "0"
88
+ )
89
+ names.append(label_name_with_count)
90
+ parents.append(group_name_with_count)
91
+ values.append("0")
92
  return parents, names, values
93
 
94
 
 
116
  else:
117
  taxonomy_group_label_mapper[el["group"]].append("Sonstiges ")
118
 
119
+ del taxonomy_group_label_mapper["Sonstiges"]
120
+
121
+ parents, names, values = get_tree_map_data(
122
  data=taxonomy_group_label_mapper,
123
  countings_parents=theme_counts,
124
  countings_labels=labels_counts,
125
  root="Musterdatenkatalog",
126
  )
127
 
128
+ df = pd.DataFrame(data={"thema": parents, "bezeichnung": names, "value": values})
129
+ df["value"] = df["value"].astype(str)
130
+ df["bezeichnung"] = df["bezeichnung"]
 
131
 
132
+ fig = go.Figure(
133
+ go.Treemap(
134
+ labels=df["bezeichnung"],
135
+ parents=df["thema"],
136
+ textinfo="label",
137
+ )
138
  )
139
+ fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
140
+ fig.update_layout(height=1000, width=1000, template="plotly")
141
 
142
 
143
  tensors = {}
 
156
 
157
  st.title("Musterdatenkatalog")
158
 
159
+ st.markdown(
160
+ """
161
+ <style>
162
+ .font {
163
+ font-size:20px !important;
164
+ }
165
+ </style>
166
+ """,
167
+ unsafe_allow_html=True,
168
+ )
169
+
170
+ st.markdown(
171
+ '<p class="font">This demo showcases the algorithm of Musterdatenkatalog (MDK) of the Bertelsmann Stiftung. The MDK is a taxonomy of Open Data in municipalities in Germany. It is intended to help municipalities in Germany, as well as data analysts and journalists, to get an overview of the topics and the extent to which cities have already published data sets.</p>',
172
+ unsafe_allow_html=True,
173
+ )
174
+
175
+
176
+ st.markdown(
177
+ '<p class="font"> For more details checkout the <a href=https://www.bertelsmann-stiftung.de/de/unsere-projekte/smart-country/musterdatenkatalog> Musterdatenkatalog.</p>',
178
+ unsafe_allow_html=True,
179
+ )
180
+
181
+
182
  col1, col2, col3 = st.columns(3)
183
  col1.metric("Datensätze", len(data))
184
  col2.metric("Themen", len(theme_counts))
 
190
 
191
  st.title("Predict a Dataset")
192
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  st.markdown(
194
  """
195
  <style>
 
220
 
221
 
222
  with col2:
223
+ st.subheader("Example Input Dataset Names")
224
  examples = [
225
  "Spielplätze",
226
  "Berliner Weihnachtsmärkte 2022",
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "musterdatenkatalog-space-app"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = ["Rahkakavee Baskaran <rahkakavee.baskaran@cause-effect.io>"]
6
+ readme = "README.md"
7
+ packages = [{include = "musterdatenkatalog_space_app"}]
8
+
9
+ [tool.poetry.dependencies]
10
+ python = "^3.11"
11
+ streamlit = "^1.22.0"
12
+ plotly-express = "^0.4.1"
13
+ plotly = "^5.14.1"
14
+ httpx = "^0.24.0"
15
+ sentence-transformers = "^2.2.2"
16
+ safetensors = "^0.3.1"
17
+
18
+
19
+ [tool.poetry.group.dev.dependencies]
20
+ black = "^23.3.0"
21
+ mypy = "^1.3.0"
22
+
23
+ [build-system]
24
+ requires = ["poetry-core"]
25
+ build-backend = "poetry.core.masonry.api"