Spaces:

Labelbox
/

databricks_upload

Sleeping

App Files Files Community

databricks_upload / app.py

ksaramout

Upload app.py

4ec16ee over 1 year ago

raw

history blame contribute delete

25 kB

	import streamlit as st
	import requests
	import re
	import json
	import time
	import pandas as pd
	import labelbox

	@st.cache_data(show_spinner=True)
	def fetch_databases(cluster_id, formatted_title, databricks_api_key):
	query = "SHOW DATABASES;"
	return execute_databricks_query(query, cluster_id, formatted_title, databricks_api_key)

	# Cached function to fetch tables
	@st.cache_data(show_spinner=True)
	def fetch_tables(selected_database, cluster_id, formatted_title, databricks_api_key):
	query = f"SHOW TABLES IN {selected_database};"
	return execute_databricks_query(query, cluster_id, formatted_title, databricks_api_key)

	# Cached function to fetch columns
	@st.cache_data(show_spinner=True)
	def fetch_columns(selected_database, selected_table, cluster_id, formatted_title, databricks_api_key):
	query = f"SHOW COLUMNS IN {selected_database}.{selected_table};"
	return execute_databricks_query(query, cluster_id, formatted_title, databricks_api_key)

	def validate_dataset_name(name):
	"""Validate the dataset name."""
	# Check length
	if len(name) > 256:
	return "Dataset name should be limited to 256 characters."
	# Check allowed characters
	allowed_characters_pattern = re.compile(r'^[A-Za-z0-9 _\-.,()\/]+$')
	if not allowed_characters_pattern.match(name):
	return ("Dataset name can only contain letters, numbers, spaces, and the following punctuation symbols: _-.,()/. Other characters are not supported.")
	return None

	def create_new_dataset_labelbox (new_dataset_name):
	client = labelbox.Client(api_key=labelbox_api_key)
	dataset_name = new_dataset_name
	dataset = client.create_dataset(name=dataset_name)
	dataset_id = dataset.uid
	return dataset_id


	def get_dataset_from_labelbox(labelbox_api_key):
	client = labelbox.Client(api_key=labelbox_api_key)
	datasets = client.get_datasets()
	return datasets

	def destroy_databricks_context(cluster_id, context_id, domain, databricks_api_key):
	DOMAIN = f"https://{domain}"
	TOKEN = f"Bearer {databricks_api_key}"

	headers = {
	"Authorization": TOKEN,
	"Content-Type": "application/json",
	}

	# Destroy context
	destroy_payload = {
	"clusterId": cluster_id,
	"contextId": context_id
	}
	destroy_response = requests.post(
	f"{DOMAIN}/api/1.2/contexts/destroy",
	headers=headers,
	data=json.dumps(destroy_payload)
	)

	if destroy_response.status_code != 200:
	raise ValueError("Failed to destroy context.")

	def execute_databricks_query(query, cluster_id, domain, databricks_api_key):
	DOMAIN = f"https://{domain}"
	TOKEN = f"Bearer {databricks_api_key}"

	headers = {
	"Authorization": TOKEN,
	"Content-Type": "application/json",
	}

	# Create context
	context_payload = {
	"clusterId": cluster_id,
	"language": "sql"
	}
	context_response = requests.post(
	f"{DOMAIN}/api/1.2/contexts/create",
	headers=headers,
	data=json.dumps(context_payload)
	)
	context_response_data = context_response.json()

	if 'id' not in context_response_data:
	raise ValueError("Failed to create context.")
	context_id = context_response_data['id']

	# Execute query
	command_payload = {
	"clusterId": cluster_id,
	"contextId": context_id,
	"language": "sql",
	"command": query
	}
	command_response = requests.post(
	f"{DOMAIN}/api/1.2/commands/execute",
	headers=headers,
	data=json.dumps(command_payload)
	).json()

	if 'id' not in command_response:
	raise ValueError("Failed to execute command.")
	command_id = command_response['id']

	# Wait for the command to complete
	while True:
	status_response = requests.get(
	f"{DOMAIN}/api/1.2/commands/status",
	headers=headers,
	params={
	"clusterId": cluster_id,
	"contextId": context_id,
	"commandId": command_id
	}
	).json()

	command_status = status_response.get("status")

	if command_status == "Finished":
	break
	elif command_status in ["Error", "Cancelled"]:
	raise ValueError(f"Command {command_status}. Reason: {status_response.get('results', {}).get('summary')}")
	else:
	time.sleep(1) # Wait for 5 seconds before checking again

	# Convert the results into a pandas DataFrame
	data = status_response.get('results', {}).get('data', [])
	columns = [col['name'] for col in status_response.get('results', {}).get('schema', [])]
	df = pd.DataFrame(data, columns=columns)

	destroy_databricks_context(cluster_id, context_id, domain, databricks_api_key)

	return df


	st.title("Labelbox 🤝 Databricks")
	st.header("Pipeline Creator", divider='rainbow')



	def is_valid_url_or_uri(value):
	"""Check if the provided value is a valid URL or URI."""
	# Check general URLs
	url_pattern = re.compile(
	r'http[s]?://(?:[a-zA-Z]\|[0-9]\|[$-_@.&+]\|[!*\$\$,]\|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
	)

	# Check general URIs including cloud storage URIs (like gs://, s3://, etc.)
	uri_pattern = re.compile(
	r'^(?:[a-z][a-z0-9+.-]:\|/)(?:/?[^\s])?$\|^(gs\|s3\|azure\|blob)://[^\s]+'
	)

	return url_pattern.match(value) or uri_pattern.match(value)



	is_preview = st.toggle('Run in Preview Mode', value=False)
	if is_preview:
	st.success('Running in Preview mode!', icon="✅")
	else:
	st.success('Running in Production mode!', icon="✅")

	st.subheader("Tell us about your Databricks and Labelbox environments", divider='grey')
	#cloud = "GCP"
	cloud = st.selectbox('Which cloud environment does your Databricks Workspace run in?', ['AWS', 'Azure', 'GCP'], index=None)
	title = st.text_input('Enter Databricks Domain (e.g., <instance>.<cloud>.databricks.com)', '')
	databricks_api_key = st.text_input('Databricks API Key', type='password')
	labelbox_api_key = st.text_input('Labelbox API Key', type='password')

	# After Labelbox API key is entered
	if labelbox_api_key:
	# Fetching datasets
	datasets = get_dataset_from_labelbox(labelbox_api_key)
	create_new_dataset = st.toggle("Make me a new dataset", value=False)

	if not create_new_dataset:
	# The existing logic for selecting datasets goes here.
	dataset_name_to_id = {dataset.name: dataset.uid for dataset in datasets}
	selected_dataset_name = st.selectbox("Select an existing dataset:", list(dataset_name_to_id.keys()))
	dataset_id = dataset_name_to_id[selected_dataset_name]

	else:
	# If user toggles "make me a new dataset"
	new_dataset_name = st.text_input("Enter the new dataset name:")

	# Check if the name is valid
	if new_dataset_name:
	validation_message = validate_dataset_name(new_dataset_name)
	if validation_message:
	st.error(validation_message, icon="🚫")
	else:
	st.success(f"Valid dataset name! Dataset_id", icon="✅")
	dataset_name = new_dataset_name

	# Define the variables beforehand with default values (if not defined)
	new_dataset_name = new_dataset_name if 'new_dataset_name' in locals() else None
	selected_dataset_name = selected_dataset_name if 'selected_dataset_name' in locals() else None

	if new_dataset_name or selected_dataset_name:
	# Handling various formats of input
	formatted_title = re.sub(r'^https?://', '', title) # Remove http:// or https://
	formatted_title = re.sub(r'/$', '', formatted_title) # Remove trailing slash if present

	if formatted_title:
	st.subheader("Select an existing cluster", divider='grey', help="Jobs will use job clusters to reduce DBUs consumed.")
	DOMAIN = f"https://{formatted_title}"
	TOKEN = f"Bearer {databricks_api_key}"

	HEADERS = {
	"Authorization": TOKEN,
	"Content-Type": "application/json",
	}

	# Endpoint to list clusters
	ENDPOINT = "/api/2.0/clusters/list"

	try:
	response = requests.get(DOMAIN + ENDPOINT, headers=HEADERS)
	response.raise_for_status()

	# Include clusters with cluster_source "UI" or "API"
	clusters = response.json().get("clusters", [])
	cluster_dict = {
	cluster["cluster_name"]: cluster["cluster_id"]
	for cluster in clusters if cluster.get("cluster_source") in ["UI", "API"]
	}

	# Display dropdown with cluster names

	if cluster_dict:
	selected_cluster_name = st.selectbox(
	'Select a cluster to run on',
	list(cluster_dict.keys()),
	key='unique_key_for_cluster_selectbox',
	index=None,
	placeholder="Select a cluster..",
	)
	if selected_cluster_name:
	cluster_id = cluster_dict[selected_cluster_name]

	except requests.RequestException as e:
	st.write(f"Error communicating with Databricks API: {str(e)}")
	except ValueError:
	st.write("Received unexpected response from Databricks API.")

	if selected_cluster_name and cluster_id:
	# Check if the selected cluster is running
	cluster_state = [cluster["state"] for cluster in clusters if cluster["cluster_id"] == cluster_id][0]

	# If the cluster is not running, start it
	if cluster_state != "RUNNING":
	with st.spinner("Starting the selected cluster. This typically takes 10 minutes. Please wait..."):
	start_response = requests.post(f"{DOMAIN}/api/2.0/clusters/start", headers=HEADERS, json={"cluster_id": cluster_id})
	start_response.raise_for_status()

	# Poll until the cluster is up or until timeout
	start_time = time.time()
	timeout = 1200 # 20 minutes in seconds
	while True:
	cluster_response = requests.get(f"{DOMAIN}/api/2.0/clusters/get", headers=HEADERS, params={"cluster_id": cluster_id}).json()
	if "state" in cluster_response:
	if cluster_response["state"] == "RUNNING":
	break
	elif cluster_response["state"] in ["TERMINATED", "ERROR"]:
	st.write(f"Error starting cluster. Current state: {cluster_response['state']}")
	break

	if (time.time() - start_time) > timeout:
	st.write("Timeout reached while starting the cluster.")
	break

	time.sleep(10) # Check every 10 seconds

	st.success(f"{selected_cluster_name} is now running!", icon="🏃‍♂️")
	else:
	st.success(f"{selected_cluster_name} is already running!", icon="🏃‍♂️")


	def generate_cron_expression(freq, hour=0, minute=0, day_of_week=None, day_of_month=None):
	"""
	Generate a cron expression based on the provided frequency and time.
	"""
	if freq == "1 minute":
	return "0 * * * * ?"
	elif freq == "1 hour":
	return f"0 {minute} * * * ?"
	elif freq == "1 day":
	return f"0 {minute} {hour} * * ?"
	elif freq == "1 week":
	if not day_of_week:
	raise ValueError("Day of week not provided for weekly frequency.")
	return f"0 {minute} {hour} ? * {day_of_week}"
	elif freq == "1 month":
	if not day_of_month:
	raise ValueError("Day of month not provided for monthly frequency.")
	return f"0 {minute} {hour} {day_of_month} * ?"
	else:
	raise ValueError("Invalid frequency provided")

	# Streamlit UI
	st.subheader("Run Frequency", divider='grey')

	# Dropdown to select frequency
	freq_options = ["1 day", "1 week", "1 month"]
	selected_freq = st.selectbox("Select frequency:", freq_options, placeholder="Select frequency..")

	day_of_week = None
	day_of_month = None

	# If the frequency is hourly, daily, weekly, or monthly, ask for a specific time
	if selected_freq != "1 minute":

	if selected_freq == "1 week":
	days_options = ["MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN"]
	day_of_week = st.selectbox("Select day of the week:", days_options)

	elif selected_freq == "1 month":
	day_of_month = st.selectbox("Select day of the month:", list(range(1, 32)))

	col1, col2 = st.columns(2)
	with col1:
	hour = st.selectbox("Hour:", list(range(0, 24)))
	with col2:
	minute = st.selectbox("Minute:", list(range(0, 60)))

	else:
	hour, minute = 0, 0

	# Generate the cron expression
	frequency = generate_cron_expression(selected_freq, hour, minute, day_of_week, day_of_month)

	# Assumed DBU consumption rate for a 32GB, 4-core node per hour
	X = 1 # Replace this with the actual rate from Databricks' pricing or documentation

	# Calculate DBU consumption for a single run
	min_dbu_single_run = (X/6) * (1 + 10) # Assuming maximum scaling to 10 workers
	max_dbu_single_run = (2X/3) (1 + 10)

	# Estimate monthly DBU consumption based on frequency
	if freq_options == "1 day":
	min_dbu_monthly = 30 * min_dbu_single_run
	max_dbu_monthly = 30 * max_dbu_single_run
	elif freq_options == "1 week":
	min_dbu_monthly = 4 * min_dbu_single_run
	max_dbu_monthly = 4 * max_dbu_single_run
	else: # Monthly
	min_dbu_monthly = min_dbu_single_run
	max_dbu_monthly = max_dbu_single_run

	# Calculate runs per month
	if selected_freq == "1 day":
	runs_per_month = 30
	elif selected_freq == "1 week":
	runs_per_month = 4
	else: # "1 month"
	runs_per_month = 1

	# Calculate estimated DBU consumption per month
	min_dbu_monthly = runs_per_month * min_dbu_single_run
	max_dbu_monthly = runs_per_month * max_dbu_single_run

	def generate_human_readable_message(freq, hour=0, minute=0, day_of_week=None, day_of_month=None):
	"""
	Generate a human-readable message for the scheduling.
	"""
	if freq == "1 minute":
	return "Job will run every minute."
	elif freq == "1 hour":
	return f"Job will run once an hour at minute {minute}."
	elif freq == "1 day":
	return f"Job will run daily at {hour:02}:{minute:02}."
	elif freq == "1 week":
	if not day_of_week:
	raise ValueError("Day of week not provided for weekly frequency.")
	return f"Job will run every {day_of_week} at {hour:02}:{minute:02}."
	elif freq == "1 month":
	if not day_of_month:
	raise ValueError("Day of month not provided for monthly frequency.")
	return f"Job will run once a month on day {day_of_month} at {hour:02}:{minute:02}."
	else:
	raise ValueError("Invalid frequency provided")

	# Generate the human-readable message
	readable_msg = generate_human_readable_message(selected_freq, hour, minute, day_of_week, day_of_month)

	# Main code block
	if frequency:
	st.success(readable_msg, icon="📅")
	# Display the estimated DBU consumption to the user
	st.warning(f"Estimated DBU Consumption:\n- For a single run: {min_dbu_single_run:.2f} to {max_dbu_single_run:.2f} DBUs\n- Monthly (based on {runs_per_month} runs): {min_dbu_monthly:.2f} to {max_dbu_monthly:.2f} DBUs")

	# Disclaimer
	st.info("Disclaimer: This is only an estimation. Always monitor the job in Databricks to assess actual DBU consumption.")

	st.subheader("Select a table", divider="grey")

	# Fetching databases
	result_data = fetch_databases(cluster_id, formatted_title, databricks_api_key)
	database_names = result_data['databaseName'].tolist()
	selected_database = st.selectbox("Select a Database:", database_names, index=None, placeholder="Select a database..")

	if selected_database:
	# Fetching tables
	result_data = fetch_tables(selected_database, cluster_id, formatted_title, databricks_api_key)
	table_names = result_data['tableName'].tolist()
	selected_table = st.selectbox("Select a Table:", table_names, index=None, placeholder="Select a table..")

	if selected_table:
	# Fetching columns
	result_data = fetch_columns(selected_database, selected_table, cluster_id, formatted_title, databricks_api_key)
	column_names = result_data['col_name'].tolist()

	st.subheader("Map table schema to Labelbox schema", divider="grey")
	# Your existing code to handle schema mapping...

	# Fetch the first 5 rows of the selected table
	with st.spinner('Fetching first 5 rows of the selected table...'):
	query = f"SELECT * FROM {selected_database}.{selected_table} LIMIT 5;"
	table_sample_data = execute_databricks_query(query, cluster_id, formatted_title, databricks_api_key)
	st.write(table_sample_data)

	# Define two columns for side-by-side selectboxes
	col1, col2 = st.columns(2)

	with col1:
	selected_row_data = st.selectbox(
	"row_data (required):",
	column_names,
	index=None,
	placeholder="Select a column..",
	help="Select the column that contains the URL/URI bucket location of the data rows you wish to import into Labelbox."
	)

	with col2:
	selected_global_key = st.selectbox(
	"global_key (optional):",
	column_names,
	index=None,
	placeholder="Select a column..",
	help="Select the column that contains the global key. If not provided, a new key will be generated for you."
	)

	# Fetch a single row from the selected table
	query_sample_row = f"SELECT * FROM {selected_database}.{selected_table} LIMIT 1;"
	result_sample = execute_databricks_query(query_sample_row, cluster_id, formatted_title, databricks_api_key)

	if selected_row_data:
	# Extract the value from the selected row_data column
	sample_row_data_value = result_sample[selected_row_data].iloc[0]

	# Validate the extracted value
	dataset_id = create_new_dataset_labelbox(new_dataset_name) if create_new_dataset else dataset_id
	# Mode
	mode = "preview" if is_preview else "production"

	# Databricks instance and API key
	databricks_instance = formatted_title
	databricks_api_key = databricks_api_key

	# Dataset ID and New Dataset
	new_dataset = 1 if create_new_dataset else 0
	dataset_id = dataset_id

	# Table Path
	table_path = f"{selected_database}.{selected_table}"
	# Frequency
	frequency = frequency

	# Schema Map
	row_data_input = selected_row_data
	global_key_input = selected_global_key

	# Create the initial dictionary
	schema_map_dict = {'row_data': row_data_input}
	if global_key_input:
	schema_map_dict['global_key'] = global_key_input

	# Swap keys and values
	reversed_schema_map_dict = {v: k for k, v in schema_map_dict.items()}

	# Convert the reversed dictionary to a stringified JSON
	reversed_schema_map_str = json.dumps(reversed_schema_map_dict)


	data = {
	"cloud": cloud,
	"mode": mode,
	"databricks_instance": databricks_instance,
	"databricks_api_key": databricks_api_key,
	"new_dataset": new_dataset,
	"dataset_id": dataset_id,
	"table_path": table_path,
	"labelbox_api_key": labelbox_api_key,
	"frequency": frequency,
	"new_cluster": 0,
	"cluster_id": cluster_id,
	"schema_map": reversed_schema_map_str
	}


	if st.button("Deploy Pipeline!", type="primary"):
	# Ensure all fields are filled out
	required_fields = [
	mode, databricks_instance, databricks_api_key, new_dataset, dataset_id,
	table_path, labelbox_api_key, frequency, cluster_id, reversed_schema_map_str
	]


	# Sending a POST request to the Flask app endpoint
	with st.spinner("Deploying pipeline..."):
	response = requests.post("https://us-central1-dbt-prod.cloudfunctions.net/deploy-databricks-pipeline", json=data)

	# Check if request was successful
	if response.status_code == 200:
	# Display the response using Streamlit
	st.balloons()
	response = response.json()
	# Extract the job_id
	job_id = response['message'].split('job_id":')[1].split('}')[0]
	from urllib.parse import urlparse, parse_qs

	# Parse the Databricks instance URL to extract the organization ID
	parsed_url = urlparse(formatted_title)
	query_params = parse_qs(parsed_url.query)
	organization_id = query_params.get("o", [""])[0]

	# Generate the Databricks Job URL
	job_url = f"http://{formatted_title}/?o={organization_id}#job/{job_id}"
	st.success(f"Pipeline deployed successfully! [{job_url}]({job_url}) 🚀")
	else:
	st.error(f"Failed to deploy pipeline. Response: {response.text}", icon="🚫")

	st.markdown("""
	<style>
	/* Add a large bottom padding to the main content */
	.main .block-container {
	padding-bottom: 1000px; /* Adjust this value as needed */
	}
	</style>
	""", unsafe_allow_html=True)