lint committed on
Commit
0822b7f
1 Parent(s): e8ac366

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. Dockerfile +23 -0
  2. gsql_app.py +124 -0
  3. query.py +3 -0
  4. requirements.txt +4 -0
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10

WORKDIR /code

COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Set up a new user named "user" with user ID 1000
RUN useradd -m -u 1000 user
# Switch to the "user" user
USER user
# Set home to the user's home directory
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Set the working directory to the user's home directory
WORKDIR $HOME/app

# Copy the current directory contents into the container at $HOME/app setting the owner to the user
COPY --chown=user . $HOME/app

# Start the app when a container starts. RUN would execute the script once
# during `docker build` and leave the image without a start command.
CMD ["python", "gsql_app.py"]
gsql_app.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import duckdb
3
+ import pandas as pd
4
+ import gradio as gr
5
+ from datasets import load_dataset
6
+ import tempfile
7
+ import re
8
+ from query import sql_query
9
+
10
# Row limit passed to each dataframe preview and quoted in its label.
max_rows = 20

# Keyword arguments shared by the input and output gr.Dataframe components.
df_display_kwargs = {
    "wrap": True,
    "max_rows": max_rows,
    "type": "pandas",
    "row_count": 3,
    "col_count": 4,
}

# Dataset ids offered in the "HF Dataset" dropdown; the first is the default.
dataset_choices = ["rotten_tomatoes", "sciq"]
23
+
24
def apply_sql(input_table, sql_query):
    """Run ``sql_query`` against ``input_table`` and return the result.

    The table named after the first ``FROM`` in the query is rewritten to
    ``input_table`` so that a query written against any table name is
    executed against the supplied dataframe.

    Args:
        input_table: Dataframe to query. The parameter name is significant:
            the rewritten SQL refers to ``input_table``, which DuckDB
            resolves to the Python variable of that name.
        sql_query: SQL text to execute.

    Returns:
        The query result as a pandas DataFrame.
    """
    # Use regex to extract the table name from the SQL query
    match = re.search(r"\bFROM\s+(\w+)", sql_query, re.IGNORECASE)
    if match:
        table_name = match.group(1)
        # Rewrite whole-word occurrences only. A plain str.replace would also
        # rewrite the name wherever it appears as a substring (a table called
        # "t" would corrupt "text", "SELECT", ...), producing invalid SQL.
        sql_query = re.sub(
            rf"\b{re.escape(table_name)}\b", "input_table", sql_query
        )

    return duckdb.query(sql_query).to_df()
36
+
37
def display_dataset(dataset_id):
    """Load the ``train`` split of ``dataset_id`` as a pandas DataFrame.

    Returns:
        A 2-tuple holding the same DataFrame twice, one entry per output
        wired to this callback.
    """
    frame = load_dataset(dataset_id, split="train").to_pandas()
    return frame, frame
42
+
43
def upload_dataset(dataset_file):
    """Read an uploaded CSV file into a pandas DataFrame.

    Args:
        dataset_file: Object exposing the uploaded file's path as ``.name``,
            or ``None`` when no file is present.

    Returns:
        ``(None, None)`` when ``dataset_file`` is ``None``; otherwise a
        2-tuple holding the parsed DataFrame twice.
    """
    if dataset_file is not None:
        uploaded_path = dataset_file.name
        # Log the path of the uploaded file to stdout.
        print(uploaded_path)
        frame = pd.read_csv(uploaded_path)
        return frame, frame

    return None, None
53
+
54
+
55
def process_dataset(full_dataset, sql_query):
    """Run ``sql_query`` on the full dataset and write the result to a CSV.

    Args:
        full_dataset: Dataframe the query is executed against.
        sql_query: SQL text; it must refer to the data as ``input_table``.

    Returns:
        A 2-tuple of the result DataFrame and the path of a temporary file
        containing it as CSV. The file is created with ``delete=False`` and
        is not removed here.
    """
    # Keep this local's name: the SQL refers to the data as `input_table`,
    # which DuckDB resolves to the Python variable of that name.
    input_table = full_dataset
    result = duckdb.query(sql_query).to_df()

    tmp = tempfile.NamedTemporaryFile(delete=False)
    with tmp:
        csv_path = tmp.name
        result.to_csv(csv_path)

    return result, csv_path
64
+
65
+
66
# Soft theme with a blue primary hue and slate neutrals.
theme = gr.themes.Soft(
    primary_hue="blue",
    neutral_hue="slate",
)


# Build the UI. Components are laid out in the order they are created.
# NOTE(review): the original nesting was not recoverable from the diff view;
# the indentation below is a best-effort reconstruction -- confirm layout.
# NOTE(review): nothing in this file calls demo.launch(); confirm how the app
# is expected to be served.
with gr.Blocks(analytics_enabled=False, theme=theme) as demo:
    # Holds the full dataframe between callbacks (written by the load/upload
    # handlers, read by the download handler).
    full_dataset = gr.State()

    with gr.Column():
        with gr.Row().style(equal_height=True):

            # Left panel: dark-mode toggle and Hub dataset selection.
            with gr.Column(variant="panel"):

                with gr.Row():
                    dark_mode_btn = gr.Button("Dark Mode", variant="primary")
                    load_dataset_button = gr.Button("Load HF Dataset", variant="secondary")

                dataset_selector = gr.Dropdown(label="HF Dataset", choices=dataset_choices, value=dataset_choices[0])


            # Right panel: SQL editor and the buttons that run the query.
            with gr.Column(variant="compact"):

                with gr.Row():
                    sql_query_btn = gr.Button("Apply SQL Query", variant="secondary")
                    download_dataset_btn = gr.Button("Download Queried Dataset", variant="primary")

                sql_query_comp = gr.Code(language=None, label="SQL Query", lines=3, value=sql_query)

        # File widgets for uploading a dataset and downloading the result.
        with gr.Row().style(equal_height=True):
            upload_dataset_comp = gr.File(label="Upload Dataset")
            download_dataset_comp = gr.File(label="Download Dataset")

        # Input and output dataframe previews.
        with gr.Column(variant="panel"):
            input_df_display = gr.Dataframe(**df_display_kwargs, label=f"Input Dataframe (Truncated to first {max_rows} Rows)")

            output_df_display = gr.Dataframe(**df_display_kwargs, label=f"Output Dataframe (Truncated to first {max_rows} Rows)")

    # Loading from the Hub or uploading a file fills both the input preview
    # and the full_dataset state.
    load_dataset_button.click(fn=display_dataset, inputs=[dataset_selector], outputs=[input_df_display, full_dataset])
    upload_dataset_comp.change(fn=upload_dataset, inputs=[upload_dataset_comp], outputs=[input_df_display, full_dataset])

    # "Apply SQL Query" runs the query on the contents of the input preview.
    sql_query_btn.click(fn=apply_sql, inputs=[input_df_display, sql_query_comp], outputs=[output_df_display])

    # "Download Queried Dataset" runs the query on the full_dataset state and
    # also fills the download file widget.
    download_dataset_btn.click(fn=process_dataset, inputs=[full_dataset, sql_query_comp], outputs=[output_df_display, download_dataset_comp])

    # Client-side only handler (fn=None): if any element has the `dark` class,
    # strip it from all of them; otherwise add `dark` to <body>.
    toggle_dark_mode_args = dict(
        fn=None,
        inputs=None,
        outputs=None,
        _js="""() => {
            if (document.querySelectorAll('.dark').length) {
                document.querySelectorAll('.dark').forEach(el => el.classList.remove('dark'));
            } else {
                document.querySelector('body').classList.add('dark');
            }
        }""",
    )
    # The same toggle runs once on page load and on every button click.
    demo.load(**toggle_dark_mode_args)
    dark_mode_btn.click(**toggle_dark_mode_args)
query.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
# Example SQL query; it refers to the data as `input_table`.
sql_query = """
SELECT * FROM input_table WHERE text LIKE '%the rock%'
"""
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio==3.33.1
2
+ pandas>=2.0
3
+ duckdb>=0.8.0
4
+ datasets