asoria HF staff commited on
Commit
27e0148
1 Parent(s): ddab0bb

adding initial files

Browse files
Files changed (3) hide show
  1. README.md +4 -4
  2. app.py +96 -0
  3. requirements.txt +8 -0
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: Datasets Text2sql
3
- emoji: 🏢
4
- colorFrom: yellow
5
- colorTo: gray
6
  sdk: gradio
7
  sdk_version: 4.19.2
8
  app_file: app.py
 
1
  ---
2
+ title: Datasets text2sql
3
+ emoji: 🐣
4
+ colorFrom: green
5
+ colorTo: green
6
  sdk: gradio
7
  sdk_version: 4.19.2
8
  app_file: app.py
app.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import duckdb
3
+ import gradio as gr
4
+ from dotenv import load_dotenv
5
+ from httpx import Client
6
+ from huggingface_hub import HfApi
7
+ from huggingface_hub.utils import logging
8
+ from llama_cpp import Llama
9
+
10
+ load_dotenv()
11
+
12
+ HF_TOKEN = os.getenv("HF_TOKEN")
13
+ assert HF_TOKEN is not None, "You need to set HF_TOKEN in your environment variables"
14
+
15
+
16
+ BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
17
+ API_URL = "https://m82etjwvhoptr3t5.us-east-1.aws.endpoints.huggingface.cloud"
18
+ headers = {
19
+ "Accept" : "application/json",
20
+ "Authorization": f"Bearer {HF_TOKEN}",
21
+ "Content-Type": "application/json"
22
+ }
23
+
24
+ logger = logging.get_logger(__name__)
25
+ client = Client(headers=headers)
26
+ api = HfApi(token=HF_TOKEN)
27
+ llama = Llama(
28
+ model_path="DuckDB-NSQL-7B-v0.1-q8_0.gguf",
29
+ n_ctx=2048,
30
+ )
31
+
32
+ def get_first_parquet(dataset: str):
33
+ resp = client.get(f"{BASE_DATASETS_SERVER_URL}/parquet?dataset={dataset}")
34
+ return resp.json()["parquet_files"][0]
35
+
36
+
37
+ def query_remote_model(text):
38
+ payload = {
39
+ "inputs": text,
40
+ "parameters": {}
41
+ }
42
+ response = client.post(API_URL, headers=headers, json=payload)
43
+ pred = response.json()
44
+ return pred[0]["generated_text"]
45
+
46
+
47
+ def query_local_model(text):
48
+ pred = llama(text, temperature=0.1, max_tokens=500)
49
+ return pred["choices"][0]["text"]
50
+
51
+
52
+ def text2sql(dataset_name, query_input):
53
+ print(f"start text2sql for {dataset_name}")
54
+ try:
55
+ first_parquet = get_first_parquet(dataset_name)
56
+ except Exception as e:
57
+ return f"❌ Dataset does not exist or is not supported {e}"
58
+ first_parquet_url = first_parquet["url"]
59
+ print(first_parquet_url)
60
+ con = duckdb.connect()
61
+ con.execute("INSTALL 'httpfs'; LOAD httpfs;")
62
+ con.execute(f"CREATE TABLE data as SELECT * FROM '{first_parquet_url}' LIMIT 1;")
63
+ result = con.sql("SELECT sql FROM duckdb_tables() where table_name ='data';").df()
64
+ con.close()
65
+
66
+ ddl_create = result.iloc[0,0]
67
+ text = f"""### Instruction:
68
+ Your task is to generate valid duckdb SQL to answer the following question.
69
+
70
+ ### Input:
71
+ Here is the database schema that the SQL query will run on:
72
+ {ddl_create}
73
+ ### Question:
74
+ {query_input}
75
+
76
+ ### Response (use duckdb shorthand if possible):
77
+ """
78
+
79
+ print(text)
80
+
81
+ # sql_output = query_remote_model(text)
82
+
83
+ sql_output = query_local_model(text)
84
+ return sql_output
85
+
86
+
87
+ with gr.Blocks() as demo:
88
+ gr.Markdown("# Talk to your dataset")
89
+ gr.Markdown("This space shows how to talk to your datasets: Get a brief description, create SQL queries, and get results.")
90
+ gr.Markdown("Generate SQL queries'")
91
+ dataset_name = gr.Textbox("sksayril/medicine-info", label="Dataset Name")
92
+ query_input = gr.Textbox("How many rows there are?", label="Ask something about your data")
93
+ btn = gr.Button("Generate SQL")
94
+ query_output = gr.Textbox(label="Output SQL", interactive= False)
95
+ btn.click(text2sql, inputs=[dataset_name, query_input], outputs=query_output)
96
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio==4.18.0
2
+ httpx
3
+ huggingface_hub
4
+ pandas
5
+ python-dotenv
6
+ duckdb
7
+ llama-cpp-python
8
+ wurlitzer