Haleshot committed on
Commit
7eca5fd
·
unverified ·
1 Parent(s): fe6a0c9

changes and fixes

Browse files
DuckDB_Loading_CSVs.py → duckdb/DuckDB_Loading_CSVs.py RENAMED
@@ -1,167 +1,175 @@
1
- # /// script
2
- # requires-python = ">=3.10"
3
- # dependencies = [
4
- # "marimo",
5
- # "plotly.express",
6
- # "plotly==6.0.1",
7
- # "duckdb==1.2.1",
8
- # "sqlglot==26.11.1",
9
- # ]
10
- # ///
11
-
12
-
13
- import marimo
14
-
15
- __generated_with = "0.11.17"
16
- app = marimo.App(width="medium")
17
-
18
-
19
- @app.cell
20
- def _():
21
- import marimo as mo
22
- import plotly.express as px
23
- return mo, px
24
-
25
-
26
- @app.cell(hide_code=True)
27
- def _(mo):
28
- mo.md(r"""#Loading CSVs with DuckDB""")
29
- return
30
-
31
-
32
- @app.cell(hide_code=True)
33
- def _(mo):
34
- mo.md(
35
- r"""
36
- <p> I remember when I first learnt about DuckDB, it was a gamechanger - I used to load the data I wanted to work on to a database software like MS SQL Server, and then build a bridge to an IDE with the language I wanted to use like Python, or R; it was quite the hassle. DuckDB changed my whole world - now I could just import the data file into the IDE, or notebook, make a duckdb connection, and there we go! But then, I realized I didn't even need the step of first importing the file using python. I could just query the csv file directly using SQL through a DuckDB connection.</p>
37
-
38
- ##Introduction
39
- <p> I found this dataset on the evolution of AI research by disclipine from <a href= "https://oecd.ai/en/data?selectedArea=ai-research&selectedVisualization=16731"> OECD </a>, and it piqued my interest. I feel like publications in natural language processing drastically jumped in the mid 2010s, and I'm excited to find out if that's the case. </p>
40
-
41
- <p> In this notebook, we'll: </p>
42
- <ul>
43
- <li> Import the CSV file into the notebook</li>
44
- <li> Create another table within the database based on the CSV</li>
45
- <li> Dig into publications on natural language processing have evolved over the years</li>
46
- </ul>
47
- """
48
- )
49
- return
50
-
51
-
52
- @app.cell(hide_code=True)
53
- def _(mo):
54
- mo.md(r"""##Load the CSV""")
55
- return
56
-
57
-
58
- @app.cell
59
- def _(mo):
60
- _df = mo.sql(
61
- f"""
62
- /* Another way to load the CSV could be
63
- SELECT *
64
- FROM read_csv('https://github.com/Mustjaab/Loading_CSVs_in_DuckDB/blob/main/AI_Research_Data.csv')
65
- */
66
- SELECT *
67
- FROM "https://github.com/Mustjaab/Loading_CSVs_in_DuckDB/blob/main/AI_Research_Data.csv"
68
- LIMIT 5;
69
- """
70
- )
71
- return
72
-
73
-
74
- @app.cell(hide_code=True)
75
- def _(mo):
76
- mo.md(r"""##Create Another Table""")
77
- return
78
-
79
-
80
- @app.cell
81
- def _(mo):
82
- Discipline_Analysis = mo.sql(
83
- f"""
84
- -- Build a table based on the CSV where it just contains the specified columns
85
- CREATE TABLE Domain_Analysis AS
86
- SELECT Year, Concept, publications FROM "https://github.com/Mustjaab/Loading_CSVs_in_DuckDB/blob/main/AI_Research_Data.csv"
87
- """
88
- )
89
- return Discipline_Analysis, Domain_Analysis
90
-
91
-
92
- @app.cell
93
- def _(Domain_Analysis, mo):
94
- Analysis = mo.sql(
95
- f"""
96
- SELECT *
97
- FROM Domain_Analysis
98
- GROUP BY Concept, Year, publications
99
- ORDER BY Year
100
- """
101
- )
102
- return (Analysis,)
103
-
104
-
105
- @app.cell
106
- def _(Domain_Analysis, mo):
107
- _df = mo.sql(
108
- f"""
109
- SELECT
110
- AVG(CASE WHEN Year < 2020 THEN publications END) AS avg_pre_2020,
111
- AVG(CASE WHEN Year >= 2020 THEN publications END) AS avg_2020_onward
112
- FROM Domain_Analysis
113
- WHERE Concept = 'Natural language processing';
114
- """
115
- )
116
- return
117
-
118
-
119
- @app.cell
120
- def _(Domain_Analysis, mo):
121
- NLP_Analysis = mo.sql(
122
- f"""
123
- SELECT
124
- publications,
125
- CASE
126
- WHEN Year < 2020 THEN 'Pre-2020'
127
- ELSE '2020-onward'
128
- END AS period
129
- FROM Domain_Analysis
130
- WHERE Year >= 2000
131
- AND Concept = 'Natural language processing';
132
- """,
133
- output=False
134
- )
135
- return (NLP_Analysis,)
136
-
137
-
138
- @app.cell
139
- def _(NLP_Analysis, px):
140
- px.box(NLP_Analysis,x=1,y=0,color = 1)
141
- return
142
-
143
-
144
- @app.cell(hide_code=True)
145
- def _(mo):
146
- mo.md(r"""<p> We can see there's a significant increase in NLP publications 2020 and onwards which definitely makes sense provided the rapid emergence of commercial large langage models, and AI assistants. </p>""")
147
- return
148
-
149
-
150
- @app.cell(hide_code=True)
151
- def _(mo):
152
- mo.md(
153
- r"""
154
- ##Conclusion
155
- <p> In this notebook, we learned how to:</p>
156
- <ul>
157
- <li> Load a CSV into DuckDB </li>
158
- <li> Create other tables using the imported CSV </li>
159
- <li> Seamlessly analyze and visualize data between SQL, and Python cells</li>
160
- </ul>
161
- """
162
- )
163
- return
164
-
165
-
166
- if __name__ == "__main__":
167
- app.run()
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.10"
3
+ # dependencies = [
4
+ # "marimo",
5
+ # "plotly.express",
6
+ # "plotly==6.0.1",
7
+ # "duckdb==1.2.1",
8
+ # "sqlglot==26.11.1",
9
+ # "pyarrow==19.0.1",
10
+ # "polars==1.27.1",
11
+ # ]
12
+ # ///
13
+
14
+ import marimo
15
+
16
+ __generated_with = "0.12.10"
17
+ app = marimo.App(width="medium")
18
+
19
+
20
+ @app.cell(hide_code=True)
21
+ def _(mo):
22
+ mo.md(r"""#Loading CSVs with DuckDB""")
23
+ return
24
+
25
+
26
+ @app.cell(hide_code=True)
27
+ def _(mo):
28
+ mo.md(
29
+ r"""
30
+ <p> I remember when I first learnt about DuckDB, it was a gamechanger — I used to load the data I wanted to work on to a database software like MS SQL Server, and then build a bridge to an IDE with the language I wanted to use like Python, or R; it was quite the hassle. DuckDB changed my whole world — now I could just import the data file into the IDE, or notebook, make a duckdb connection, and there we go! But then, I realized I didn't even need the step of first importing the file using python. I could just query the csv file directly using SQL through a DuckDB connection.</p>
31
+
32
+ ##Introduction
33
+ <p> I found this dataset on the evolution of AI research by discipline from <a href= "https://oecd.ai/en/data?selectedArea=ai-research&selectedVisualization=16731"> OECD</a>, and it piqued my interest. I feel like publications in natural language processing drastically jumped in the mid-2010s, and I'm excited to find out if that's the case. </p>
34
+
35
+ <p> In this notebook, we'll: </p>
36
+ <ul>
37
+ <li> Import the CSV file into the notebook</li>
38
+ <li> Create another table within the database based on the CSV</li>
39
+ <li> Dig into how publications on natural language processing have evolved over the years</li>
40
+ </ul>
41
+ """
42
+ )
43
+ return
44
+
45
+
46
+ @app.cell(hide_code=True)
47
+ def _(mo):
48
+ mo.md(r"""##Load the CSV""")
49
+ return
50
+
51
+
52
+ @app.cell
53
+ def _(mo):
54
+ _df = mo.sql(
55
+ f"""
56
+ /* Another way to load the CSV could be
57
+ SELECT *
58
+ FROM read_csv('https://raw.githubusercontent.com/Mustjaab/Loading_CSVs_in_DuckDB/refs/heads/main/AI_Research_Data.csv')
59
+ */
60
+ SELECT *
61
+ FROM "https://raw.githubusercontent.com/Mustjaab/Loading_CSVs_in_DuckDB/refs/heads/main/AI_Research_Data.csv"
62
+ LIMIT 5;
63
+ """
64
+ )
65
+ return
66
+
67
+
68
+ @app.cell(hide_code=True)
69
+ def _(mo):
70
+ mo.md(r"""##Create Another Table""")
71
+ return
72
+
73
+
74
+ @app.cell
75
+ def _(mo):
76
+ Discipline_Analysis = mo.sql(
77
+ f"""
78
+ -- Build a table based on the CSV where it just contains the specified columns
79
+ CREATE TABLE Domain_Analysis AS
80
+ SELECT Year, Concept, publications FROM "https://raw.githubusercontent.com/Mustjaab/Loading_CSVs_in_DuckDB/refs/heads/main/AI_Research_Data.csv"
81
+ """
82
+ )
83
+ return Discipline_Analysis, Domain_Analysis
84
+
85
+
86
+ @app.cell
87
+ def _(Domain_Analysis, mo):
88
+ Analysis = mo.sql(
89
+ f"""
90
+ SELECT *
91
+ FROM Domain_Analysis
92
+ GROUP BY Concept, Year, publications
93
+ ORDER BY Year
94
+ """
95
+ )
96
+ return (Analysis,)
97
+
98
+
99
+ @app.cell
100
+ def _(Domain_Analysis, mo):
101
+ _df = mo.sql(
102
+ f"""
103
+ SELECT
104
+ AVG(CASE WHEN Year < 2020 THEN publications END) AS avg_pre_2020,
105
+ AVG(CASE WHEN Year >= 2020 THEN publications END) AS avg_2020_onward
106
+ FROM Domain_Analysis
107
+ WHERE Concept = 'Natural language processing';
108
+ """
109
+ )
110
+ return
111
+
112
+
113
+ @app.cell
114
+ def _(Domain_Analysis, mo):
115
+ NLP_Analysis = mo.sql(
116
+ f"""
117
+ SELECT
118
+ publications,
119
+ CASE
120
+ WHEN Year < 2020 THEN 'Pre-2020'
121
+ ELSE '2020-onward'
122
+ END AS period
123
+ FROM Domain_Analysis
124
+ WHERE Year >= 2000
125
+ AND Concept = 'Natural language processing';
126
+ """,
127
+ output=False
128
+ )
129
+ return (NLP_Analysis,)
130
+
131
+
132
+ @app.cell
133
+ def _(NLP_Analysis, px):
134
+ px.box(NLP_Analysis, x='period', y='publications', color='period')
135
+ return
136
+
137
+
138
+ @app.cell(hide_code=True)
139
+ def _(mo):
140
+ mo.md(r"""<p> We can see there's a significant increase in NLP publications from 2020 onwards, which definitely makes sense given the rapid emergence of commercial large language models and AI assistants. </p>""")
141
+ return
142
+
143
+
144
+ @app.cell(hide_code=True)
145
+ def _(mo):
146
+ mo.md(
147
+ r"""
148
+ ##Conclusion
149
+ <p> In this notebook, we learned how to:</p>
150
+ <ul>
151
+ <li> Load a CSV into DuckDB </li>
152
+ <li> Create other tables using the imported CSV </li>
153
+ <li> Seamlessly analyze and visualize data between SQL, and Python cells</li>
154
+ </ul>
155
+ """
156
+ )
157
+ return
158
+
159
+
160
+ @app.cell
161
+ def _():
162
+ import pyarrow
163
+ import polars
164
+ return polars, pyarrow
165
+
166
+
167
+ @app.cell
168
+ def _():
169
+ import marimo as mo
170
+ import plotly.express as px
171
+ return mo, px
172
+
173
+
174
+ if __name__ == "__main__":
175
+ app.run()