"""
TODOS:
- Improve prompts
- Improve model usage (Quantization?)
- Improve error handling
- Add more tests
- Improve response in a friendly way
"""
| import gradio as gr | |
| from gradio_huggingfacehub_search import HuggingfaceHubSearch | |
| import duckdb | |
| import pandas as pd | |
| import requests | |
| from outlines import prompt | |
| from transformers import AutoTokenizer, AutoModelForCausalLM | |
| import spaces | |
| import json | |
| import torch | |
| import logging | |
# Base URL of the Hugging Face Datasets Server REST API.
BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
logger = logging.getLogger(__name__)
"""
Methods for generating potential questions and SQL queries
"""
# Target device for tokenized inputs; assumes a GPU Space — TODO confirm
# this matches where device_map="auto" actually places the model.
device = "cuda"
# Instruction-tuned Gemma 2B, used to propose questions about a dataset.
gemma_model_id = "google/gemma-2b-it"
gemma_tokenizer = AutoTokenizer.from_pretrained(gemma_model_id)
# bfloat16 halves memory vs fp32; device_map="auto" shards/places layers.
gemma_model = AutoModelForCausalLM.from_pretrained(
    gemma_model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16
)
def generate_potential_questions_with_gemma(prompt):
    """Run Gemma on *prompt* and return the full decoded generation.

    Note: the returned text includes the prompt itself (no slicing of the
    generated ids), so callers must strip it out.
    """
    encoded = gemma_tokenizer(prompt, return_tensors="pt").to(device)
    generated = gemma_model.generate(**encoded, max_new_tokens=1024)
    return gemma_tokenizer.decode(generated[0], skip_special_tokens=True)
def prompt_for_questions(dataset, schema, first_rows):
    """Build the question-generation prompt sent to Gemma.

    As originally written this function's body was only a Jinja template
    docstring (an outlines ``@prompt`` template with the decorator missing),
    so calling it returned ``None``. The template is now rendered explicitly.

    Args:
        dataset: Hub dataset id, interpolated into the prompt.
        schema: CREATE TABLE DDL string describing the dataset.
        first_rows: iterable of sample rows (dicts), one rendered per line.

    Returns:
        The full prompt string asking for 5 question/SQL pairs as JSON.
    """
    header = (
        f"You are a data analyst tasked with exploring a dataset named {dataset}.\n"
        "Below is the dataset schema in SQL format along with a sample of 3 rows:\n"
        f"{schema}\n"
        "Sample rows:\n"
    )
    # One sample row per line, mirroring the original {% for %} loop.
    examples = "".join(f"{row}\n" for row in first_rows)
    # Kept verbatim from the original template (including the "Ouput" typo)
    # so the model sees the same instructions as before.
    footer = """Your goal is to generate a list of 5 potential questions that a user might want
to ask about this dataset. Consider the information contained in the provided
columns and rows, and try to think of meaningful questions that could
provide insights or useful information. For each question, provide the SQL query
that would extract the relevant information from the dataset.
Ouput JSON format:
{
    "questions": [
        {"question": [Insert question here]", "sql_query": "[Insert SQL query here]"},
        {"question": [Insert question here]", "sql_query": "[Insert SQL query here]"},
        {"question": [Insert question here]", "sql_query": "[Insert SQL query here]"},
        {"question": [Insert question here]", "sql_query": "[Insert SQL query here]"},
        {"question": [Insert question here]", "sql_query": "[Insert SQL query here]"},
    ]
}
Please ensure that each SQL query retrieves relevant information from the dataset to answer the corresponding question accurately.
Return only the JSON object, do not add extra information.
"""
    return header + examples + footer
"""
Methods for generating SQL based on a user request
"""
# Text-to-SQL model specialised for DuckDB dialect; used as a fallback when
# Gemma's proposed SQL fails to execute.
mother_duckdb_model_id = "motherduckdb/DuckDB-NSQL-7B-v0.1"
mother_duck_tokenizer = AutoTokenizer.from_pretrained(mother_duckdb_model_id)
mother_duck_model = AutoModelForCausalLM.from_pretrained(
    mother_duckdb_model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16
)
def generate_sql_with_mother_duck(prompt):
    """Run DuckDB-NSQL on *prompt* and return the decoded text.

    The output includes the prompt (max_length caps prompt + completion at
    1024 tokens); the caller splits on the "### Response" marker.
    """
    tokens = mother_duck_tokenizer(prompt, return_tensors="pt").to(device)
    generated_ids = mother_duck_model.generate(tokens.input_ids, max_length=1024)
    return mother_duck_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
def prompt_for_sql(ddl_create, query_input):
    """Build the DuckDB-NSQL prompt for translating a question into SQL.

    As originally written this function's body was only a Jinja template
    docstring (an outlines ``@prompt`` template with the decorator missing),
    so calling it returned ``None``. The template is now rendered explicitly.

    Args:
        ddl_create: CREATE TABLE DDL the query will run against.
        query_input: natural-language question to translate.

    Returns:
        The prompt string, ending with the "### Response" marker that
        callers use to slice the model's completion.
    """
    return (
        "### Instruction:\n"
        "Your task is to generate valid duckdb SQL to answer the following question.\n"
        "### Input:\n"
        "Here is the database schema that the SQL query will run on:\n"
        f"{ddl_create}\n"
        "### Question:\n"
        f"{query_input}\n"
        "### Response (use duckdb shorthand if possible):\n"
    )
"""
Datasets Viewer Methods
https://huggingface.co/docs/datasets-server/index
"""
def get_first_parquet(dataset: str):
    """Return metadata for the dataset's first parquet file.

    Fixes: the original had no timeout (a hung request stalls the app) and
    never checked the HTTP status, so API errors surfaced as a cryptic
    KeyError instead of an HTTPError.

    Args:
        dataset: Hub dataset id.

    Returns:
        Dict with at least "config", "split" and "url" keys.

    Raises:
        requests.HTTPError: on a non-2xx response.
        KeyError/IndexError: if the dataset exposes no parquet files.
    """
    resp = requests.get(f"{BASE_DATASETS_SERVER_URL}/parquet?dataset={dataset}", timeout=30)
    resp.raise_for_status()
    return resp.json()["parquet_files"][0]
def get_dataset_schema(parquet_url: str):
    """Return the CREATE TABLE DDL inferred from a parquet file.

    Fixes: the original interpolated the URL into SQL with an f-string
    (breaks on quotes in the URL) and leaked the DuckDB connection when any
    statement raised. The URL is now passed as a bound parameter and the
    connection is closed in a ``finally`` block.

    Args:
        parquet_url: remote parquet file readable by DuckDB httpfs.

    Returns:
        The ``CREATE TABLE data(...)`` DDL string.
    """
    con = duckdb.connect()
    try:
        # LIMIT 1: only the schema is needed, not the data.
        con.execute("CREATE TABLE data as SELECT * FROM read_parquet(?) LIMIT 1;", [parquet_url])
        result = con.sql("SELECT sql FROM duckdb_tables() where table_name ='data';").df()
        return result.iloc[0, 0]
    finally:
        con.close()
def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):
    """Fetch a shuffled sample of the split's first rows as a DataFrame.

    Fixes: the original had no request timeout and never checked the HTTP
    status, so API errors surfaced as a KeyError on "rows".

    Args:
        dataset: Hub dataset id.
        config: dataset config name.
        split: split name.
        limit: number of rows to keep after shuffling.

    Returns:
        pandas DataFrame with at most ``limit`` rows.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    resp = requests.get(
        f"{BASE_DATASETS_SERVER_URL}/first-rows?dataset={dataset}&config={config}&split={split}",
        timeout=30,
    )
    resp.raise_for_status()
    rows = [row["row"] for row in resp.json()["rows"]]
    # sample(frac=1) shuffles so repeated calls surface different rows.
    return pd.DataFrame.from_dict(rows).sample(frac=1).head(limit)
"""
Main logic, to get the recommended queries
"""
def get_recommended_queries(dataset: str):
    """Generate candidate questions + SQL for *dataset* and execute them.

    Pipeline: fetch schema and sample rows, ask Gemma for 5 question/SQL
    pairs, run each query with DuckDB; if Gemma's SQL fails, fall back to
    the DuckDB-NSQL model for that question.

    Fixes: the fallback's failure path was a bare ``except: pass`` that
    silently discarded errors — it now logs them; the local ``prompt``
    variable shadowed the ``outlines.prompt`` import and was renamed.

    Args:
        dataset: Hub dataset id.

    Returns:
        Dict mapping the Gradio components (gr_txt_ddl, gr_txt_prompt,
        gr_df_result) to their new values; on failure gr_df_result carries
        a one-row error DataFrame.
    """
    ddl_create, questions_prompt = "", ""
    try:
        first_split = get_first_parquet(dataset)
        df_first_rows = get_first_rows_as_df(dataset, first_split["config"], first_split["split"], 3)
        first_parquet_url = first_split["url"]
        logger.info(f"First parquet URL: {first_parquet_url}")
        ddl_create = get_dataset_schema(first_parquet_url)
        questions_prompt = prompt_for_questions(dataset, ddl_create, df_first_rows.to_dict('records'))
        # Gemma wraps its answer in a ```json fenced block; splitting on
        # "``json" keeps everything after the fence and [:-4] drops the
        # closing fence characters.
        txt_questions = generate_potential_questions_with_gemma(questions_prompt).split("``json")[1].replace('\n', ' ').strip()[:-4]
        data = json.loads(txt_questions)
        questions = data["questions"]
        potential_questions = []
        for question in questions:
            # First attempt: run Gemma's own SQL against the parquet file.
            try:
                sql = question["sql_query"].replace("FROM data", f"FROM '{first_parquet_url}'")
                result = duckdb.sql(sql).df()
                potential_questions.append({"question": question["question"], "result": result, "sql_query": sql})
                continue
            except Exception as err:
                logger.error(f"Error in running SQL query: {question['sql_query']} {err}")
            # Fallback: ask the dedicated text-to-SQL model for this question.
            mother_duck_prompt = prompt_for_sql(ddl_create, question["question"])
            sql = generate_sql_with_mother_duck(mother_duck_prompt).split("### Response (use duckdb shorthand if possible):")[-1].strip()
            sql = sql.replace("FROM data", f"FROM '{first_parquet_url}'")
            try:
                result = duckdb.sql(sql).df()
                potential_questions.append({"question": question["question"], "result": result, "sql_query": sql})
            except Exception as fallback_err:
                # Was a silent bare except; keep best-effort behavior but log it.
                logger.error(f"Fallback SQL query also failed: {sql} {fallback_err}")
        df_result = pd.DataFrame(potential_questions)
    except Exception as err:
        logger.error(f"Error in getting recommended queries: {err}")
        return {
            gr_txt_ddl: ddl_create,
            gr_txt_prompt: questions_prompt,
            gr_df_result: pd.DataFrame([{"error": f"β {err=}"}])
        }
    return {
        gr_txt_ddl: ddl_create,
        gr_txt_prompt: questions_prompt,
        gr_df_result: df_result
    }
def preview_dataset(dataset: str):
    """Fetch 4 sample rows of *dataset* for the preview DataFrame.

    Best-effort: any failure (missing dataset, API error) is rendered as a
    one-row error DataFrame instead of raising.
    """
    try:
        split_info = get_first_parquet(dataset)
        preview_df = get_first_rows_as_df(dataset, split_info["config"], split_info["split"], 4)
    except Exception as err:
        preview_df = pd.DataFrame([{"Unable to preview dataset": f"β {err=}"}])
    return {gr_df_first_rows: preview_df}
# Gradio UI: dataset search box, a preview button/table, an insights
# button/table, and a collapsed accordion exposing the DDL and the prompt.
with gr.Blocks() as demo:
    gr.Markdown("# π« Dataset Insights Explorer π«")
    gr_dataset_name = HuggingfaceHubSearch(
        label="Hub Dataset ID",
        placeholder="Search for dataset id on Huggingface",
        search_type="dataset",
        value="jamescalam/world-cities-geo",  # default example dataset
    )
    gr_preview_btn = gr.Button("Preview Dataset")
    gr_df_first_rows = gr.DataFrame(datatype="markdown")
    gr_recommend_btn = gr.Button("Show Insights")
    gr_df_result = gr.DataFrame(datatype="markdown")
    # Debug/details panel: read-only DDL and generated prompt.
    with gr.Accordion("Open for details", open=False):
        gr_txt_ddl = gr.Textbox(label="Dataset as CREATE DDL", interactive= False)
        gr_txt_prompt = gr.Textbox(label="Generated prompt to get recommended questions", interactive= False)
    gr_preview_btn.click(preview_dataset, inputs=[gr_dataset_name], outputs=[gr_df_first_rows])
    gr_recommend_btn.click(get_recommended_queries, inputs=[gr_dataset_name], outputs=[gr_txt_ddl, gr_txt_prompt, gr_df_result])
demo.launch()