Lineage-graph-accelerator / samples /sql_ddl_sample.sql
aamanlamba's picture
Phase 2: Enhanced lineage extraction with export to data catalogs
0510038
-- Sample SQL DDL with complex lineage relationships
-- E-commerce Data Warehouse Schema
-- ============================================
-- RAW LAYER - Source tables
-- ============================================
CREATE TABLE raw.customers (
customer_id INTEGER PRIMARY KEY,
email VARCHAR(255) NOT NULL,
first_name VARCHAR(100),
last_name VARCHAR(100),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
country VARCHAR(50),
segment VARCHAR(50)
);
CREATE TABLE raw.orders (
order_id INTEGER PRIMARY KEY,
customer_id INTEGER REFERENCES raw.customers(customer_id),
order_date DATE NOT NULL,
total_amount DECIMAL(10,2),
currency VARCHAR(3) DEFAULT 'USD',
status VARCHAR(20),
shipping_address_id INTEGER
);
CREATE TABLE raw.products (
product_id INTEGER PRIMARY KEY,
product_name VARCHAR(255) NOT NULL,
category VARCHAR(100),
subcategory VARCHAR(100),
brand VARCHAR(100),
price DECIMAL(10,2),
cost DECIMAL(10,2)
);
CREATE TABLE raw.order_items (
order_item_id INTEGER PRIMARY KEY,
order_id INTEGER REFERENCES raw.orders(order_id),
product_id INTEGER REFERENCES raw.products(product_id),
quantity INTEGER NOT NULL,
unit_price DECIMAL(10,2),
discount_percent DECIMAL(5,2) DEFAULT 0
);
-- ============================================
-- STAGING LAYER - Cleaned data
-- ============================================
CREATE VIEW staging.stg_customers AS
SELECT
customer_id,
LOWER(TRIM(email)) as email,
INITCAP(first_name) as first_name,
INITCAP(last_name) as last_name,
DATE(created_at) as signup_date,
UPPER(country) as country,
COALESCE(segment, 'Unknown') as segment
FROM raw.customers
WHERE email IS NOT NULL;
-- LINEAGE: raw.customers -> staging.stg_customers
CREATE VIEW staging.stg_orders AS
SELECT
order_id,
customer_id,
order_date,
total_amount,
currency,
CASE
WHEN status IN ('completed', 'shipped', 'delivered') THEN 'Fulfilled'
WHEN status IN ('pending', 'processing') THEN 'In Progress'
ELSE 'Other'
END as order_status
FROM raw.orders
WHERE order_date >= '2024-01-01';
-- LINEAGE: raw.orders -> staging.stg_orders
CREATE VIEW staging.stg_products AS
SELECT
product_id,
product_name,
category,
subcategory,
brand,
price,
cost,
(price - cost) / NULLIF(price, 0) * 100 as margin_percent
FROM raw.products
WHERE price > 0;
-- LINEAGE: raw.products -> staging.stg_products
CREATE VIEW staging.stg_order_items AS
SELECT
order_item_id,
order_id,
product_id,
quantity,
unit_price,
discount_percent,
quantity * unit_price * (1 - discount_percent/100) as line_total
FROM raw.order_items;
-- LINEAGE: raw.order_items -> staging.stg_order_items
-- ============================================
-- INTERMEDIATE LAYER - Business logic
-- ============================================
CREATE TABLE intermediate.int_customer_orders AS
SELECT
c.customer_id,
c.email,
c.first_name,
c.last_name,
c.signup_date,
c.country,
c.segment,
COUNT(DISTINCT o.order_id) as total_orders,
SUM(o.total_amount) as total_spent,
MIN(o.order_date) as first_order_date,
MAX(o.order_date) as last_order_date,
AVG(o.total_amount) as avg_order_value
FROM staging.stg_customers c
LEFT JOIN staging.stg_orders o ON c.customer_id = o.customer_id
GROUP BY c.customer_id, c.email, c.first_name, c.last_name,
c.signup_date, c.country, c.segment;
-- LINEAGE: staging.stg_customers, staging.stg_orders -> intermediate.int_customer_orders
CREATE TABLE intermediate.int_order_details AS
SELECT
o.order_id,
o.customer_id,
o.order_date,
o.order_status,
oi.product_id,
p.product_name,
p.category,
p.brand,
oi.quantity,
oi.unit_price,
oi.line_total,
p.margin_percent
FROM staging.stg_orders o
JOIN staging.stg_order_items oi ON o.order_id = oi.order_id
JOIN staging.stg_products p ON oi.product_id = p.product_id;
-- LINEAGE: staging.stg_orders, staging.stg_order_items, staging.stg_products -> intermediate.int_order_details
-- ============================================
-- MARTS LAYER - Dimensional model
-- ============================================
CREATE TABLE marts.dim_customers AS
SELECT
customer_id,
email,
first_name || ' ' || last_name as full_name,
signup_date,
country,
segment,
total_orders,
total_spent,
first_order_date,
last_order_date,
avg_order_value,
CASE
WHEN total_spent > 10000 THEN 'Platinum'
WHEN total_spent > 5000 THEN 'Gold'
WHEN total_spent > 1000 THEN 'Silver'
ELSE 'Bronze'
END as customer_tier,
DATEDIFF(day, signup_date, first_order_date) as days_to_first_order
FROM intermediate.int_customer_orders;
-- LINEAGE: intermediate.int_customer_orders -> marts.dim_customers
CREATE TABLE marts.dim_products AS
SELECT
product_id,
product_name,
category,
subcategory,
brand,
price,
cost,
margin_percent,
CASE
WHEN margin_percent > 50 THEN 'High Margin'
WHEN margin_percent > 25 THEN 'Medium Margin'
ELSE 'Low Margin'
END as margin_tier
FROM staging.stg_products;
-- LINEAGE: staging.stg_products -> marts.dim_products
CREATE TABLE marts.fct_orders AS
SELECT
od.order_id,
od.customer_id,
od.product_id,
od.order_date,
od.order_status,
od.quantity,
od.unit_price,
od.line_total,
od.margin_percent,
dc.customer_tier,
dp.margin_tier,
dp.category as product_category
FROM intermediate.int_order_details od
JOIN marts.dim_customers dc ON od.customer_id = dc.customer_id
JOIN marts.dim_products dp ON od.product_id = dp.product_id;
-- LINEAGE: intermediate.int_order_details, marts.dim_customers, marts.dim_products -> marts.fct_orders
-- ============================================
-- REPORTING LAYER - Analytics views
-- ============================================
CREATE VIEW reporting.rpt_daily_sales AS
SELECT
order_date,
product_category,
COUNT(DISTINCT order_id) as num_orders,
SUM(quantity) as units_sold,
SUM(line_total) as gross_revenue,
AVG(line_total) as avg_order_value
FROM marts.fct_orders
GROUP BY order_date, product_category;
-- LINEAGE: marts.fct_orders -> reporting.rpt_daily_sales
CREATE VIEW reporting.rpt_customer_ltv AS
SELECT
customer_id,
full_name,
customer_tier,
country,
total_orders,
total_spent as lifetime_value,
avg_order_value,
days_to_first_order,
DATEDIFF(day, first_order_date, last_order_date) as customer_lifespan_days,
total_spent / NULLIF(DATEDIFF(month, first_order_date, last_order_date), 0) as monthly_value
FROM marts.dim_customers
WHERE total_orders > 0;
-- LINEAGE: marts.dim_customers -> reporting.rpt_customer_ltv
CREATE VIEW reporting.rpt_product_performance AS
SELECT
dp.product_id,
dp.product_name,
dp.category,
dp.brand,
dp.margin_tier,
COUNT(DISTINCT fo.order_id) as times_ordered,
SUM(fo.quantity) as total_units_sold,
SUM(fo.line_total) as total_revenue,
AVG(fo.margin_percent) as avg_margin
FROM marts.dim_products dp
LEFT JOIN marts.fct_orders fo ON dp.product_id = fo.product_id
GROUP BY dp.product_id, dp.product_name, dp.category, dp.brand, dp.margin_tier;
-- LINEAGE: marts.dim_products, marts.fct_orders -> reporting.rpt_product_performance
-- ============================================
-- SUMMARY: Lineage Flow
-- ============================================
-- raw.customers -> staging.stg_customers -> intermediate.int_customer_orders -> marts.dim_customers -> reporting.rpt_customer_ltv
-- raw.orders -> staging.stg_orders -> intermediate.int_customer_orders
-- raw.orders -> staging.stg_orders -> intermediate.int_order_details -> marts.fct_orders -> reporting.rpt_daily_sales
-- raw.products -> staging.stg_products -> intermediate.int_order_details
-- raw.products -> staging.stg_products -> marts.dim_products -> marts.fct_orders
-- raw.order_items -> staging.stg_order_items -> intermediate.int_order_details