Aziz Alto committed on
Commit
87969f7
β€’
1 Parent(s): f4fd12c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +342 -0
app.py ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import streamlit as st
3
+ import streamlit_ace as stace
4
+ import duckdb
5
+ import numpy as np # for user session
6
+ import scipy # for user session
7
+ import plotly.express as px # for user session
8
+ import plotly.figure_factory as ff # for user session
9
+ import matplotlib.pyplot as plt # for user session
10
+ import sklearn
11
+ from ydata_profiling import ProfileReport
12
+ from streamlit_pandas_profiling import st_profile_report
13
+
14
# Page-level configuration and header shown at the top of the app.
# The icon is the magnifying-glass emoji (it had been mis-encoded into mojibake).
st.set_page_config(page_title="PySQLify", page_icon="🔎", layout="wide")
st.title("PySQLify")
st.write("_Data Analysis_ Tool")
17
+
18
+
19
@st.cache_data
def _read_csv(f, **kwargs):
    """Load a CSV source into a DataFrame with whitespace-trimmed column names.

    Args:
        f: Source handed straight to ``pandas.read_csv`` (the callers in this
            file pass an uploaded file object or a URL string).
        **kwargs: Extra keyword arguments forwarded to ``pandas.read_csv``.

    Returns:
        The parsed DataFrame. Malformed rows are skipped
        (``on_bad_lines="skip"``) instead of aborting the load.
    """
    df = pd.read_csv(f, on_bad_lines="skip", **kwargs)
    # Trim stray whitespace around header names. Only string labels can be
    # stripped; any other label type (e.g. the integer labels produced when
    # ``header=None`` is forwarded) is kept as-is instead of raising.
    df.columns = [c.strip() if isinstance(c, str) else c for c in df.columns]
    return df
25
+
26
+
27
# Sample datasets offered in the "Select a sample dataset" dropdown.
# Maps the label shown to the user to the URL of a CSV file that is loaded
# through `_read_csv`.
# NOTE(review): the "World Population" entry points at a world-cities file
# (`worldcities_clean.csv`) -- confirm the label is the intended one.
SAMPLE_DATA = {
    "Churn dataset": "https://raw.githubusercontent.com/AtashfarazNavid/MachineLearing-ChurnModeling/main/Streamlit-WebApp-1/Churn.csv",
    "Periodic Table": "https://gist.githubusercontent.com/GoodmanSciences/c2dd862cd38f21b0ad36b8f96b4bf1ee/raw/1d92663004489a5b6926e944c1b3d9ec5c40900e/Periodic%2520Table%2520of%2520Elements.csv",
    "Movies": "https://raw.githubusercontent.com/reisanar/datasets/master/HollywoodMovies.csv",
    "Iris Flower": "https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv",
    "World Population": "https://gist.githubusercontent.com/curran/13d30e855d48cdd6f22acdf0afe27286/raw/0635f14817ec634833bb904a47594cc2f5f9dbf8/worldcities_clean.csv",
    "Country Table": "https://raw.githubusercontent.com/datasciencedojo/datasets/master/WorldDBTables/CountryTable.csv",
    "World Cities": "https://raw.githubusercontent.com/dr5hn/countries-states-cities-database/master/csv/cities.csv",
    "World States": "https://raw.githubusercontent.com/dr5hn/countries-states-cities-database/master/csv/states.csv",
    "World Countries": "https://raw.githubusercontent.com/dr5hn/countries-states-cities-database/master/csv/countries.csv"
}
38
+
39
+
40
def read_data():
    """Prompt for a data source and return it as a DataFrame.

    Three inputs are rendered side by side: a file uploader, a URL text box
    and a sample-dataset dropdown. When more than one is filled in, the sample
    dataset takes precedence over the URL, which takes precedence over the
    uploaded file. The script run is halted with ``st.stop()`` while no source
    has been provided, and also when the chosen source cannot be parsed.

    Returns:
        The DataFrame produced by ``_read_csv`` for the chosen source.
    """
    txt = "Upload a data file (supported files: .csv)"
    placeholder = st.empty()
    with placeholder:
        col1, col2, col3 = st.columns([3, 2, 1])
        with col1:
            file_ = st.file_uploader(txt, help="TODO: .tsv, .xls, .xlsx")
        with col2:
            url = st.text_input(
                "Read from a URL",
                placeholder="Enter URL (supported types: .csv and .tsv)",
            )
            # Ignore whitespace-only input and stray spaces from copy/paste.
            url = url.strip()
            if url:
                file_ = url
        with col3:
            selected = st.selectbox("Select a sample dataset", options=[""] + list(SAMPLE_DATA))
            if selected:
                file_ = SAMPLE_DATA[selected]

    if not file_:
        st.stop()

    # A source was chosen: remove the three input widgets from the page.
    placeholder.empty()
    # `min_value=0` keeps the row count from being driven below zero.
    kwargs = {"skiprows": st.number_input("skip header", value=0, min_value=0, max_value=10)}
    try:
        return _read_csv(file_, **kwargs)
    except Exception as e:
        st.warning("Unsupported file type!")
        # The failure is not necessarily the file type (it may be e.g. an
        # unreachable URL), so show what actually went wrong.
        st.text(f"{type(e).__name__}: {e}")
        st.stop()
69
+
70
+
71
def display(df):
    """Render the DataFrame, its shape and, on request, its column dtypes.

    The dtype table is written to the sidebar only when the
    "view data types" checkbox is ticked.
    """
    show_types = st.checkbox("view data types")
    st.dataframe(df, use_container_width=True)

    # shape summary under the table
    st.markdown(f"> <sup>shape `{df.shape}`</sup>", unsafe_allow_html=True)

    if not show_types:
        return

    rows = []
    for column, dtype in df.dtypes.to_dict().items():
        rows.append({"Column": column, "Type": dtype})
    details = pd.DataFrame(rows)
    st.sidebar.subheader("TABLE DETAILS")
    st.sidebar.write(details)
84
+
85
+
86
def code_editor(language, hint, show_panel, key=None):
    """Render one Ace editor cell and return the text it currently holds.

    Args:
        language: Editor language mode; ``"sql"`` gets the
            ``solarized_dark`` default theme, anything else ``chrome``.
        hint: Placeholder text shown while the editor is empty.
        show_panel: Whether the "CELL CONFIG" expander stays on the page.
        key: Identifier of the cell; it is also embedded in the keys of the
            configuration widgets.

    Returns:
        Whatever ``stace.st_ace`` returns for this editor.
    """
    panel_slot = st.empty()

    if language == "sql":
        first_theme = "solarized_dark"
    else:
        first_theme = "chrome"

    def widget_key(slot):
        # One key per configuration widget: <language><slot number><cell key>.
        return f"{language}{slot}{key}"

    # The configuration widgets are always created so their values are
    # available below; the expander is removed again when it is not wanted.
    with panel_slot.expander("CELL CONFIG"):
        left, right = st.columns(2)
        with left:
            theme = st.selectbox("Theme", options=[first_theme, *stace.THEMES], key=widget_key(1))
            tab_size = st.slider("Tab size", min_value=1, max_value=8, value=4, key=widget_key(2))
        with right:
            keybinding = st.selectbox(
                "Keybinding",
                options=[stace.KEYBINDINGS[-2], *stace.KEYBINDINGS],
                key=widget_key(3),
            )
            font_size = st.slider("Font size", min_value=5, max_value=24, value=14, key=widget_key(4))
        height = st.slider("Editor height", value=230, max_value=777, key=widget_key(5))
        # TODO: DRY - collect theme/keybinding into a kwargs dict
    if not show_panel:
        panel_slot.empty()

    # Spawn the Ace editor with the chosen settings.
    return stace.st_ace(
        language=language,
        height=height,
        show_gutter=False,
        placeholder=hint,
        keybinding=keybinding,
        theme=theme,
        font_size=font_size,
        tab_size=tab_size,
        key=key,
    )
124
+
125
+
126
@st.cache_data
def query_data(sql, df):
    """Run a SQL statement with DuckDB against the loaded DataFrame.

    Args:
        sql: SQL text. The editor hint tells users the data lives in a table
            named ``df``.
        df: DataFrame to query. It is never passed to DuckDB explicitly, so
            the query can only reach it through the name ``df``; do not
            rename this parameter.

    Returns:
        The query result as a DataFrame, or ``None`` when the query fails
        (a warning with the error text is shown instead).
    """
    try:
        return duckdb.query(sql).df()
    except Exception as e:
        st.warning("Invalid Query!")
        # Show the underlying message so the user can correct the statement.
        st.text(str(e))
        # st.stop()
        return None
133
+
134
+
135
def download(df, key, save_as="results.csv"):
    """Offer ``df`` as a CSV file through a "Download" button.

    Args:
        df: DataFrame to export; serialised with ``to_csv()`` and encoded
            as UTF-8.
        key: Widget key for the download button.
        save_as: File name suggested for the download.
    """
    payload = df.to_csv().encode("utf-8")
    st.download_button("Download", payload, save_as, "text/csv", key=key)
149
+
150
+
151
def display_results(query: str, result: pd.DataFrame, key: str):
    """Show a query result with its shape and a download button.

    Args:
        query: The SQL text that produced ``result`` (not used for rendering).
        result: Result DataFrame, or ``None`` when the query failed.
        key: Widget key handed to the download button.
    """
    # `query_data` yields None for an invalid query; there is nothing to
    # render then, and touching `result.shape` would raise AttributeError.
    if result is None:
        return
    st.dataframe(result, use_container_width=True)
    st.markdown(f"> `{result.shape}`")
    download(result, key=key)
155
+
156
+
157
def run_python_script(user_script, key):
    """Execute a user-supplied Python snippet and render its outcome.

    The snippet is turned into the code to run as follows:

    * it starts with ``st.`` or contains ``;``: run as written, one statement
      per ``;``-separated piece;
    * it ends with ``?``: show ``st.help`` for the expression (like ``?`` in
      a Jupyter notebook);
    * otherwise: wrap it in ``st.write(...)``.

    Args:
        user_script: Text typed into a Python cell.
        key: Widget key for the "Show error" button rendered on failure.
    """
    import textwrap

    script = user_script.strip()
    if script.startswith("st.") or ";" in script:
        py = script
    elif script.endswith("?"):  # -- same as ? in Jupyter Notebook
        # Drop only the trailing marker(s), not every "?" in the expression.
        py = f"st.help({script.rstrip('?')})"
    else:
        py = f"st.write({script})"

    # SECURITY: exec() runs arbitrary code typed by the user with the full
    # privileges of this process. That is the purpose of the cell, but it
    # means the app must only be exposed to trusted users.
    #
    # All statements of one cell share a single namespace (a copy of the
    # module globals, so `df`, `st`, `pd`, ... are visible). Using one dict
    # for both globals and locals lets later statements, comprehensions and
    # lambdas see names defined by earlier statements.
    namespace = dict(globals())
    try:
        # NOTE: splitting is purely textual, so a ";" inside a string literal
        # also splits the statement.
        for cmd in py.split(";"):
            # "a; b" yields " b": remove the common leading indentation so it
            # is not rejected as an unexpected indent, while keeping the
            # relative indentation of multi-line statements.
            cmd = textwrap.dedent(cmd).strip()
            if cmd:
                exec(cmd, namespace)
    except Exception as e:
        c1, c2 = st.columns(2)
        c1.warning("Wrong Python command.")
        if c2.button("Show error", key=key):
            st.exception(e)
174
+
175
+
176
@st.cache_resource
def data_profiler(df):
    """Build the ``ProfileReport`` titled "Profiling Report" for ``df``."""
    report = ProfileReport(df, title="Profiling Report")
    return report
179
+
180
+
181
def docs():
    """Render the in-app README and the "more examples" toggle.

    The README markdown is shown inside a collapsed expander.

    Returns:
        The state of the "Show more code examples" checkbox.
    """
    content = """

# What

Upload a dataset to process (manipulate/analyze) it using SQL and Python, similar to running Jupyter Notebooks.
To get started, drag and drop the dataset file, read from a URL, or select a sample dataset. To load a new dataset, refresh the webpage.
> <sub>[_src code_ here](https://github.com/iamaziz/sqlify)</sub>

More public datasets available [here](https://github.com/fivethirtyeight/data).

# Usage

Example usage

> After loading the sample Iris dataset from sklearn (or select it from the dropdown list), the lines below can be executed inside a Python cell:

```python

from sklearn.datasets import load_iris;
from sklearn import tree;
iris = load_iris();
X, y = iris.data, iris.target;
clf = tree.DecisionTreeClassifier(max_depth=4);
clf = clf.fit(X, y);
plt.figure(figsize=(7,3));
fig, ax = plt.subplots()
tree.plot_tree(clf, filled=True, fontsize=4);
st.pyplot(fig)
```

Which outputs the tree below:

> <img width="1000" alt="image" src="https://user-images.githubusercontent.com/3298308/222992623-1dba9bad-4858-43b6-84bf-9d7cf78d61f7.png">

# SCREENSHOTS

## _EXAMPLE 1_
![image](https://user-images.githubusercontent.com/3298308/222946054-a92ea42c-ffe6-4958-900b-2b72056216f8.png)

## _EXAMPLE 2_
![image](https://user-images.githubusercontent.com/3298308/222947315-f2c06063-dd18-4215-bbab-c1b2f3f00888.png)
![image](https://user-images.githubusercontent.com/3298308/222947321-c7e38d9d-7274-4368-91c1-1548b0da14dc.png)

## _EXAMPLE 3_
![image](https://user-images.githubusercontent.com/3298308/222949287-2024a75f-04db-4861-93b5-c43d206e2dc6.png)

## _EXAMPLE 4_
![image](https://user-images.githubusercontent.com/3298308/222984104-0bfd806f-ecd9-455e-b368-181f9aa0225b.png)

"""

    # Label corrected from the truncated "READE".
    with st.expander("README"):
        # The README embeds raw HTML (<sub>, <img>), hence unsafe_allow_html.
        st.markdown(content, unsafe_allow_html=True)

    return st.checkbox("Show more code examples")
237
+
238
+
239
def display_example_snippets():
    """Let the user pick a file from ``./examples`` and show its contents.

    The picker and the selected file's text are rendered inside an
    "EXAMPLES" expander; nothing is shown until a file is chosen.
    """
    from glob import glob

    # glob() returns entries in arbitrary order; sort for a stable dropdown.
    examples = sorted(glob("./examples/*"))
    with st.expander("EXAMPLES"):
        # Widgets need a non-empty label; hide it instead of passing "".
        example = st.selectbox(
            "Example snippet",
            options=[""] + examples,
            label_visibility="collapsed",
        )
        if example:
            # Explicit encoding so the snippet reads the same on any platform.
            with open(example, "r", encoding="utf-8") as f:
                content = f.read()
            st.code(content)
249
+
250
+
251
if __name__ == "__main__":
    # README / examples first, then the dataset picker and the data preview.
    show_examples = docs()
    if show_examples:
        display_example_snippets()

    df = read_data()
    display(df)

    # run and execute SQL script
    def sql_cells(df):
        """Render the requested number of SQL cells and show each result.

        Args:
            df: The loaded DataFrame, handed to ``query_data`` for each query.
        """
        st.write("---")
        st.header("SQL")
        hint = """Type SQL to query the loaded dataset, data is stored in a table named 'df'.
For example, to select 10 rows:
    SELECT * FROM df LIMIT 10
Describe the table:
    DESCRIBE TABLE df
"""
        number_cells = st.sidebar.number_input("Number of SQL cells to use", value=1, max_value=40, min_value=1)
        for i in range(number_cells):
            col1, col2 = st.columns([2, 1])
            st.markdown("<br>", unsafe_allow_html=True)
            col1.write(f"> `IN[{i+1}]`")
            key = f"sql{i}"
            # Each cell needs its own checkbox key; one shared key is a
            # duplicate widget key as soon as a second cell is rendered.
            show_panel = col2.checkbox("Show cell config panel", key=f"{key}_panel")
            sql = code_editor("sql", hint, show_panel=show_panel, key=key)
            if sql:
                st.code(sql, language="sql")
                st.write(f"`OUT[{i+1}]`")
                res = query_data(sql, df)
                # `query_data` returns None (after warning) for a bad query.
                if res is not None:
                    display_results(sql, res, f"{key}{sql}")

    # run and execute python script
    def python_cells():
        """Render the requested number of Python cells and run each script.

        The scripts operate on the module-level ``df``.
        """
        st.write("---")
        st.header("Python")
        hint = """Type Python command (one-liner) to execute or manipulate the dataframe e.g. `df.sample(7)`. By default, results are rendered using `st.write()`.
📊 Visualization example: from "movies" dataset, plot average rating by genre:
    st.line_chart(df.groupby("Genre")[["RottenTomatoes", "AudienceScore"]].mean())
🗺 Maps example: show the top 10 populated cities in the world on map (from "World Population" dataset)
    st.map(df.sort_values(by='population', ascending=False)[:10])

NOTE: for multi-lines, a semi-colon can be used to end each line e.g.
    print("first line");
    print("second line");
"""
        # Named `cells_help` so the builtin `help` is not shadowed.
        cells_help = """
For multiple lines, use semicolons e.g.

```python

fig, ax = plt.subplots();
ax.hist(df[[col1, col2]]);
st.pyplot(fig);
```
or

```python
groups = [group for _, group in df.groupby('class')];
for i in range(3):
    st.write(groups[i]['name'].iloc[0])
    st.bar_chart(groups[i].mean())
```
"""
        number_cells = st.sidebar.number_input("Number of Python cells to use", value=1, max_value=40, min_value=1, help=cells_help)
        for i in range(number_cells):
            st.markdown("<br><br><br>", unsafe_allow_html=True)
            col1, col2 = st.columns([2, 1])
            col1.write(f"> `IN[{i+1}]`")
            show_panel = col2.checkbox("Show cell config panel", key=f"panel{i}")
            user_script = code_editor("python", hint, show_panel=show_panel, key=i)
            if user_script:
                df.rename(columns={"lng": "lon"}, inplace=True)  # hot-fix for "World Population" dataset
                st.code(user_script, language="python")
                st.write(f"`OUT[{i+1}]`")
                run_python_script(user_script, key=f"{user_script}{i}")

    # Sidebar toggles decide which sections are rendered.
    if st.sidebar.checkbox("Show SQL cells", value=True):
        sql_cells(df)
    if st.sidebar.checkbox("Show Python cells", value=True):
        python_cells()

    st.sidebar.write("---")

    if st.sidebar.checkbox("Generate Data Profile Report", help="pandas profiling, generated by [ydata-profiling](https://github.com/ydataai/ydata-profiling)"):
        st.write("---")
        st.header("Data Profiling")
        profile = data_profiler(df)
        st_profile_report(profile)

    st.write("---")