Spaces:
Paused
Paused
| -- extract-sample-data.sql | |
| -- Sample Data Extraction for View Debugging and Testing | |
| -- Citizen Intelligence Agency - Open Source Intelligence Platform | |
| -- | |
| -- Purpose: Extracts sample data from all tables and views to CSV files | |
| -- focusing on columns used in views for debugging empty views | |
| -- | |
| -- Usage: | |
| -- # Extract all sample data to CSV files | |
| -- psql -U postgres -d cia_dev -f service.data.impl/src/main/resources/extract-sample-data.sql | |
| -- | |
| -- # Or extract to specific directory | |
| -- cd /output/directory | |
| -- psql -U postgres -d cia_dev -f service.data.impl/src/main/resources/extract-sample-data.sql | |
| -- | |
| -- Output: | |
| -- - Creates CSV files for each table: table_<tablename>_sample.csv | |
| -- - Creates CSV files for each view: view_<viewname>_sample.csv | |
| -- - Creates manifest file: sample_data_manifest.csv | |
| -- - Creates distinct value files: distinct_<table>_<column>_values.csv | |
| \set ON_ERROR_STOP off | |
| \timing on | |
| \set VERBOSITY verbose | |
| \echo '==================================================' | |
| \echo 'CIA Sample Data Extraction' | |
| \echo 'Started:' `date` | |
| \echo '==================================================' | |
| -- Configuration | |
| \set SAMPLE_SIZE 50 | |
| \set TABLE_CMD_FILE '/tmp/cia_table_extract_commands.sql' | |
| \set VIEW_CMD_FILE '/tmp/cia_view_extract_commands.sql' | |
| \set DISTINCT_CMD_FILE '/tmp/cia_distinct_extract_commands.sql' | |
| \echo '' | |
| \echo '==================================================' | |
| \echo '=== INITIALIZATION ===' | |
| \echo '==================================================' | |
| DROP FUNCTION IF EXISTS cia_tmp_rowcount(text, text); | |
| CREATE OR REPLACE FUNCTION cia_tmp_rowcount(schema_name text, rel_name text) | |
| RETURNS bigint | |
| LANGUAGE plpgsql | |
| AS $$ | |
| DECLARE | |
| result bigint; | |
| BEGIN | |
| EXECUTE format('SELECT COUNT(*) FROM %I.%I', schema_name, rel_name) INTO result; | |
| RETURN COALESCE(result, 0); | |
| EXCEPTION WHEN OTHERS THEN | |
| RAISE NOTICE 'ERROR counting rows in %.%: %', schema_name, rel_name, SQLERRM; | |
| RETURN 0; | |
| END; | |
| $$; | |
| \echo 'Created helper function: cia_tmp_rowcount' | |
| \echo '' | |
| \echo 'Configuration:' | |
| \echo ' Sample size: 50 rows per table/view' | |
| \echo ' Output format: CSV with headers' | |
| \echo '' | |
| -- =========================================================================== | |
| -- SECTION 1: EARLY DISTINCT VALUE EXTRACTION (Before View Analysis) | |
| -- =========================================================================== | |
| -- This section extracts ALL distinct values from columns likely used in | |
| -- view predicates (WHERE, JOIN, GROUP BY, HAVING conditions) | |
| -- =========================================================================== | |
| \echo '' | |
| \echo '==================================================' | |
| \echo '=== PHASE 1: DISTINCT VALUE EXTRACTION ===' | |
| \echo '=== (Early extraction for view debugging) ===' | |
| \echo '==================================================' | |
| \echo '' | |
| -- Create output directory marker | |
| \! mkdir -p distinct_values | |
| \echo 'Extracting distinct values from all categorical/predicate columns...' | |
| \echo 'This runs BEFORE view row counting to capture all possible filter values.' | |
| \echo '' | |
| -- Generate distinct value extraction commands | |
| \! rm -f :DISTINCT_CMD_FILE | |
| \pset format unaligned | |
| \pset tuples_only on | |
| \o :DISTINCT_CMD_FILE | |
| -- ============================================================================ | |
| -- KNOWN CRITICAL COLUMNS: Hardcoded list of columns used in views | |
| -- These are extracted from analyzing view definitions in the schema | |
| -- ============================================================================ | |
| SELECT E'-- Critical predicate columns from view analysis\n' || | |
| E'\\echo ''Extracting critical predicate columns...''\n'; | |
| -- assignment_data columns (used in committee/government/ministry views) | |
| SELECT format( | |
| '\echo '' [CRITICAL] assignment_data.%s''' || E'\n' || | |
| '\copy (SELECT %s AS value, COUNT(*) AS count FROM assignment_data WHERE %s IS NOT NULL GROUP BY %s ORDER BY count DESC, %s LIMIT 2000) TO ''distinct_values/assignment_data_%s.csv'' WITH CSV HEADER' || E'\n', | |
| col, col, col, col, col, col | |
| ) | |
| FROM unnest(ARRAY['org_code', 'role_code', 'status', 'detail', 'assignment_type', 'intressent_id']) AS col | |
| WHERE EXISTS (SELECT 1 FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'assignment_data' AND column_name = col); | |
| -- person_data columns | |
| SELECT format( | |
| '\echo '' [CRITICAL] person_data.%s''' || E'\n' || | |
| '\copy (SELECT %s AS value, COUNT(*) AS count FROM person_data WHERE %s IS NOT NULL GROUP BY %s ORDER BY count DESC, %s LIMIT 2000) TO ''distinct_values/person_data_%s.csv'' WITH CSV HEADER' || E'\n', | |
| col, col, col, col, col, col | |
| ) | |
| FROM unnest(ARRAY['party', 'status', 'gender', 'election_region']) AS col | |
| WHERE EXISTS (SELECT 1 FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'person_data' AND column_name = col); | |
| -- vote_data columns | |
| SELECT format( | |
| '\echo '' [CRITICAL] vote_data.%s''' || E'\n' || | |
| '\copy (SELECT %s AS value, COUNT(*) AS count FROM vote_data WHERE %s IS NOT NULL GROUP BY %s ORDER BY count DESC, %s LIMIT 2000) TO ''distinct_values/vote_data_%s.csv'' WITH CSV HEADER' || E'\n', | |
| col, col, col, col, col, col | |
| ) | |
| FROM unnest(ARRAY['vote', 'ballot_type', 'party', 'gender']) AS col | |
| WHERE EXISTS (SELECT 1 FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'vote_data' AND column_name = col); | |
| -- document_data columns | |
| SELECT format( | |
| '\echo '' [CRITICAL] document_data.%s''' || E'\n' || | |
| '\copy (SELECT %s AS value, COUNT(*) AS count FROM document_data WHERE %s IS NOT NULL GROUP BY %s ORDER BY count DESC, %s LIMIT 2000) TO ''distinct_values/document_data_%s.csv'' WITH CSV HEADER' || E'\n', | |
| col, col, col, col, col, col | |
| ) | |
| FROM unnest(ARRAY['document_type', 'status', 'org', 'rm', 'sub_type']) AS col | |
| WHERE EXISTS (SELECT 1 FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'document_data' AND column_name = col); | |
| -- committee_document_data columns | |
| SELECT format( | |
| '\echo '' [CRITICAL] committee_document_data.%s''' || E'\n' || | |
| '\copy (SELECT %s AS value, COUNT(*) AS count FROM committee_document_data WHERE %s IS NOT NULL GROUP BY %s ORDER BY count DESC, %s LIMIT 2000) TO ''distinct_values/committee_document_data_%s.csv'' WITH CSV HEADER' || E'\n', | |
| col, col, col, col, col, col | |
| ) | |
| FROM unnest(ARRAY['org', 'status', 'rm', 'sub_type']) AS col | |
| WHERE EXISTS (SELECT 1 FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'committee_document_data' AND column_name = col); | |
| -- committee_proposal_data columns | |
| SELECT format( | |
| '\echo '' [CRITICAL] committee_proposal_data.%s''' || E'\n' || | |
| '\copy (SELECT %s AS value, COUNT(*) AS count FROM committee_proposal_data WHERE %s IS NOT NULL GROUP BY %s ORDER BY count DESC, %s LIMIT 2000) TO ''distinct_values/committee_proposal_data_%s.csv'' WITH CSV HEADER' || E'\n', | |
| col, col, col, col, col, col | |
| ) | |
| FROM unnest(ARRAY['decision_type', 'winner', 'committee_report', 'rm']) AS col | |
| WHERE EXISTS (SELECT 1 FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'committee_proposal_data' AND column_name = col); | |
| -- document_status_container columns | |
| SELECT format( | |
| '\echo '' [CRITICAL] document_status_container.%s''' || E'\n' || | |
| '\copy (SELECT %s AS value, COUNT(*) AS count FROM document_status_container WHERE %s IS NOT NULL GROUP BY %s ORDER BY count DESC, %s LIMIT 2000) TO ''distinct_values/document_status_container_%s.csv'' WITH CSV HEADER' || E'\n', | |
| col, col, col, col, col, col | |
| ) | |
| FROM unnest(ARRAY['document_category']) AS col | |
| WHERE EXISTS (SELECT 1 FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'document_status_container' AND column_name = col); | |
| -- document_element columns | |
| SELECT format( | |
| '\echo '' [CRITICAL] document_element.%s''' || E'\n' || | |
| '\copy (SELECT %s AS value, COUNT(*) AS count FROM document_element WHERE %s IS NOT NULL GROUP BY %s ORDER BY count DESC, %s LIMIT 2000) TO ''distinct_values/document_element_%s.csv'' WITH CSV HEADER' || E'\n', | |
| col, col, col, col, col, col | |
| ) | |
| FROM unnest(ARRAY['document_type', 'status', 'org', 'rm', 'sub_type', 'doc_type', 'database_source']) AS col | |
| WHERE EXISTS (SELECT 1 FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'document_element' AND column_name = col); | |
| -- detail_data columns | |
| SELECT format( | |
| '\echo '' [CRITICAL] detail_data.%s''' || E'\n' || | |
| '\copy (SELECT %s AS value, COUNT(*) AS count FROM detail_data WHERE %s IS NOT NULL GROUP BY %s ORDER BY count DESC, %s LIMIT 2000) TO ''distinct_values/detail_data_%s.csv'' WITH CSV HEADER' || E'\n', | |
| col, col, col, col, col, col | |
| ) | |
| FROM unnest(ARRAY['code', 'detail_type']) AS col | |
| WHERE EXISTS (SELECT 1 FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'detail_data' AND column_name = col); | |
| -- ============================================================================ | |
| -- DYNAMIC EXTRACTION: All varchar/text columns with low cardinality | |
| -- ============================================================================ | |
| SELECT E'\n-- Dynamic extraction of all categorical columns\n' || | |
| E'\\echo ''''\n' || | |
| E'\\echo ''Extracting additional categorical columns dynamically...''\n'; | |
| -- Extract all text/varchar columns that likely contain categorical data | |
| WITH categorical_columns AS ( | |
| SELECT | |
| c.table_name, | |
| c.column_name, | |
| c.data_type, | |
| c.character_maximum_length | |
| FROM information_schema.columns c | |
| JOIN information_schema.tables t | |
| ON t.table_schema = c.table_schema | |
| AND t.table_name = c.table_name | |
| WHERE c.table_schema = 'public' | |
| AND t.table_type = 'BASE TABLE' | |
| AND c.data_type IN ('character varying', 'text', 'character') | |
| -- Exclude large text fields and known non-categorical columns | |
| AND (c.character_maximum_length IS NULL OR c.character_maximum_length <= 255) | |
| -- ======================================================================== | |
| -- EXCLUDE CONTENT/TEXT FIELDS (long text, descriptions, etc.) | |
| -- ======================================================================== | |
| AND c.column_name NOT IN ('content', 'title', 'sub_title', 'description', 'text', | |
| 'summary', 'note', 'header', 'proposal', 'wording', | |
| 'wording_2', 'wording_3', 'wording_4', 'source_note', | |
| 'reference_name', 'detail_name', 'activity_name', | |
| 'config_description', 'component_description', | |
| 'error_message', 'application_message', 'storage') | |
| -- ======================================================================== | |
| -- EXCLUDE URL FIELDS | |
| -- ======================================================================== | |
| AND c.column_name NOT IN ('file_url', 'file_name', 'image_url_192', 'image_url_80', | |
| 'image_url_max', 'person_url_xml', 'document_url_html', | |
| 'document_url_text', 'document_status_url_xml', | |
| 'document_status_url_www', 'committee_proposal_url_xml', | |
| 'committee_report_url_xml', 'ballot_url_xml', 'website') | |
| -- ======================================================================== | |
| -- EXCLUDE NAME FIELDS (person names, full names, etc.) | |
| -- ======================================================================== | |
| AND c.column_name NOT IN ('first_name', 'last_name', 'full_name', 'name', | |
| 'party_name', 'country_name', 'indicator_name', | |
| 'county_name', 'municipal_name', 'region_name', | |
| 'electoral_area_name', 'election_region_name', | |
| 'capital_city', 'place', 'city', 'address', | |
| 'co_address', 'source_organization', 'adminregion_value', | |
| 'region_value', 'property_value') | |
| -- ======================================================================== | |
| -- EXCLUDE ID FIELDS (unique identifiers, foreign keys, etc.) | |
| -- ======================================================================== | |
| AND c.column_name NOT LIKE '%_id' | |
| AND c.column_name NOT LIKE '%_id_%' | |
| AND c.column_name NOT LIKE 'id_%' | |
| AND c.column_name != 'id' | |
| AND c.column_name NOT LIKE 'hjid%' | |
| AND c.column_name NOT LIKE '%intressent_id%' | |
| AND c.column_name NOT LIKE '%person_reference_id%' | |
| AND c.column_name NOT LIKE '%document_id%' | |
| AND c.column_name NOT LIKE '%ballot_id%' | |
| AND c.column_name NOT LIKE '%hangar%' | |
| AND c.column_name NOT LIKE '%guid%' | |
| -- ======================================================================== | |
| -- EXCLUDE SESSION/USER TRACKING FIELDS | |
| -- ======================================================================== | |
| AND c.column_name NOT IN ('ip_information', 'user_agent_information', 'session_id', | |
| 'user_id', 'user_password', 'email', 'username') | |
| -- ======================================================================== | |
| -- EXCLUDE DATE/TIME STRING FIELDS | |
| -- ======================================================================== | |
| AND c.column_name NOT LIKE '%_date' | |
| AND c.column_name NOT LIKE '%date_%' | |
| AND c.column_name NOT IN ('created', 'datum', 'created_date', 'made_public_date', | |
| 'system_date') | |
| -- ======================================================================== | |
| -- EXCLUDE TECHNICAL/INFRASTRUCTURE TABLES (not domain data) | |
| -- ======================================================================== | |
| -- Javers audit tables (jv_*) | |
| AND c.table_name NOT LIKE 'jv_%' | |
| -- Quartz scheduler tables (qrtz_*) | |
| AND c.table_name NOT LIKE 'qrtz_%' | |
| -- Liquibase migration tables | |
| AND c.table_name NOT IN ('databasechangelog', 'databasechangeloglock') | |
| -- Hibernate sequence table | |
| AND c.table_name != 'hibernate_sequence' | |
| -- Encrypted value storage (sensitive) | |
| AND c.table_name != 'encrypted_value' | |
| -- Application session/event tables (user tracking, not political data) | |
| AND c.table_name NOT IN ('application_session', 'application_action_event', | |
| 'application_configuration', 'application_view') | |
| -- User account tables (sensitive) | |
| AND c.table_name NOT IN ('user_account', 'user_account_address') | |
| -- Portal configuration tables | |
| AND c.table_name NOT IN ('portal', 'domain_portal', 'agency') | |
| -- Rule violation tracking | |
| AND c.table_name != 'rule_violation' | |
| -- Data source metadata | |
| AND c.table_name != 'data_source_content' | |
| -- Language/translation tables (not political data) | |
| AND c.table_name NOT IN ('language_data', 'language_content_data') | |
| -- ======================================================================== | |
| -- EXCLUDE OTHER HIGH-CARDINALITY OR NON-CATEGORICAL COLUMNS | |
| -- ======================================================================== | |
| AND c.column_name NOT IN ('label', 'temp_label', 'kall_id', 'related_id', | |
| 'next_page', 'debug', 'warning', 'document_version', | |
| 'order_number', 'bank_number', 'phone_number', | |
| 'fax_number', 'post_code', 'latitude', 'longitude', | |
| 'rest', 'value_', 'data_value') | |
| -- ======================================================================== | |
| -- Skip columns already extracted above (critical columns) | |
| -- ======================================================================== | |
| AND NOT (c.table_name = 'assignment_data' AND c.column_name IN ('org_code', 'role_code', 'status', 'detail', 'assignment_type', 'intressent_id')) | |
| AND NOT (c.table_name = 'person_data' AND c.column_name IN ('party', 'status', 'gender', 'election_region')) | |
| AND NOT (c.table_name = 'vote_data' AND c.column_name IN ('vote', 'ballot_type', 'party', 'gender')) | |
| AND NOT (c.table_name = 'document_data' AND c.column_name IN ('document_type', 'status', 'org', 'rm', 'sub_type')) | |
| AND NOT (c.table_name = 'committee_document_data' AND c.column_name IN ('org', 'status', 'rm', 'sub_type')) | |
| AND NOT (c.table_name = 'committee_proposal_data' AND c.column_name IN ('decision_type', 'winner', 'committee_report', 'rm')) | |
| AND NOT (c.table_name = 'document_status_container' AND c.column_name IN ('document_category')) | |
| AND NOT (c.table_name = 'document_element' AND c.column_name IN ('document_type', 'status', 'org', 'rm', 'sub_type', 'doc_type', 'database_source')) | |
| AND NOT (c.table_name = 'detail_data' AND c.column_name IN ('code', 'detail_type')) | |
| ORDER BY c.table_name, c.column_name | |
| ) | |
| SELECT format( | |
| '\echo '' [AUTO] %s.%s''' || E'\n' || | |
| '\copy (SELECT %s AS value, COUNT(*) AS count FROM %s WHERE %s IS NOT NULL GROUP BY %s ORDER BY count DESC, %s LIMIT 1000) TO ''distinct_values/%s_%s.csv'' WITH CSV HEADER' || E'\n', | |
| table_name, column_name, | |
| column_name, table_name, column_name, column_name, column_name, | |
| table_name, column_name | |
| ) | |
| FROM categorical_columns; | |
| -- ============================================================================ | |
| -- CROSS-TABLE VALUE ANALYSIS: Values that appear in JOINs | |
| -- ============================================================================ | |
| SELECT E'\n-- Cross-table join key analysis\n' || | |
| E'\\echo ''''\n' || | |
| E'\\echo ''Extracting cross-table join key distributions...''\n'; | |
| -- Extract ID distributions for common join patterns | |
| SELECT E'\\echo '' [JOIN] person_data IDs in assignment_data''' || E'\n' || | |
| E'\\copy (SELECT intressent_id AS value, COUNT(*) AS count FROM assignment_data WHERE intressent_id IS NOT NULL GROUP BY intressent_id ORDER BY count DESC LIMIT 500) TO ''distinct_values/join_assignment_to_person.csv'' WITH CSV HEADER' || E'\n' | |
| WHERE EXISTS (SELECT 1 FROM information_schema.tables WHERE table_schema = 'public' AND table_name = 'assignment_data'); | |
| SELECT E'\\echo '' [JOIN] document IDs in document_status_container''' || E'\n' || | |
| E'\\copy (SELECT document_document_status_con_0 AS value, COUNT(*) AS count FROM document_status_container WHERE document_document_status_con_0 IS NOT NULL GROUP BY document_document_status_con_0 ORDER BY count DESC LIMIT 500) TO ''distinct_values/join_docstatus_to_document.csv'' WITH CSV HEADER' || E'\n' | |
| WHERE EXISTS (SELECT 1 FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'document_status_container' AND column_name = 'document_document_status_con_0'); | |
| -- ============================================================================ | |
| -- SUMMARY STATISTICS | |
| -- ============================================================================ | |
| SELECT E'\n\\echo ''''\n' || | |
| E'\\echo ''==================================================''\n' || | |
| E'\\echo ''Distinct value extraction complete.''\n' || | |
| E'\\echo ''Files saved to: distinct_values/''\n' || | |
| E'\\echo ''==================================================''\n'; | |
| \o | |
| \pset format aligned | |
| \pset tuples_only off | |
| -- Execute the distinct value extractions | |
| \echo '' | |
| \echo 'Executing distinct value extractions...' | |
| \i :DISTINCT_CMD_FILE | |
| \! rm -f :DISTINCT_CMD_FILE | |
| -- Create a summary of extracted distinct values | |
| \echo '' | |
| \echo 'Creating distinct value summary...' | |
| \copy (SELECT 'Summary of Distinct Value Extractions' AS info, COUNT(*) AS file_count FROM (SELECT 1) x) TO 'distinct_values/_extraction_summary.csv' WITH CSV HEADER | |
| \! ls -la distinct_values/ 2>/dev/null | head -50 || echo "distinct_values directory listing" | |
| \echo '' | |
| \echo '==================================================' | |
| \echo '=== PHASE 1 COMPLETE: Distinct Values Extracted ===' | |
| \echo '==================================================' | |
| \echo '' | |
| -- =========================================================================== | |
| -- SECTION 2: Analyze All Views for Statistics | |
| -- =========================================================================== | |
| \echo '' | |
| \echo '==================================================' | |
| \echo '=== PHASE 2: ANALYZING ALL VIEWS ===' | |
| \echo '==================================================' | |
| \echo '' | |
| \echo 'Running ANALYZE on all views (regular and materialized)...' | |
| \echo '' | |
| DO $$ | |
| DECLARE | |
| view_record RECORD; | |
| view_count INTEGER := 0; | |
| success_count INTEGER := 0; | |
| error_count INTEGER := 0; | |
| mat_view_count INTEGER := 0; | |
| reg_view_count INTEGER := 0; | |
| start_time TIMESTAMP; | |
| end_time TIMESTAMP; | |
| duration INTERVAL; | |
| total_views INTEGER; | |
| BEGIN | |
| start_time := clock_timestamp(); | |
| -- Get total count for progress reporting | |
| SELECT | |
| (SELECT COUNT(*) FROM pg_matviews WHERE schemaname = 'public') + | |
| (SELECT COUNT(*) FROM pg_views WHERE schemaname = 'public') | |
| INTO total_views; | |
| RAISE NOTICE '================================================'; | |
| RAISE NOTICE 'Starting ANALYZE of % total views', total_views; | |
| RAISE NOTICE ' - Materialized views: %', (SELECT COUNT(*) FROM pg_matviews WHERE schemaname = 'public'); | |
| RAISE NOTICE ' - Regular views: %', (SELECT COUNT(*) FROM pg_views WHERE schemaname = 'public'); | |
| RAISE NOTICE '================================================'; | |
| RAISE NOTICE ''; | |
| -- Analyze materialized views first | |
| RAISE NOTICE '--- Analyzing Materialized Views ---'; | |
| RAISE NOTICE ''; | |
| FOR view_record IN | |
| SELECT schemaname, matviewname AS viewname, 'MATERIALIZED VIEW' AS view_type | |
| FROM pg_matviews | |
| WHERE schemaname = 'public' | |
| ORDER BY matviewname | |
| LOOP | |
| BEGIN | |
| view_count := view_count + 1; | |
| mat_view_count := mat_view_count + 1; | |
| RAISE NOTICE 'ANALYZE [%/%]: %.% (%)', view_count, total_views, | |
| view_record.schemaname, view_record.viewname, view_record.view_type; | |
| EXECUTE format('ANALYZE %I.%I', view_record.schemaname, view_record.viewname); | |
| success_count := success_count + 1; | |
| RAISE NOTICE ' ✓ Analyzed successfully'; | |
| RAISE NOTICE ''; | |
| EXCEPTION WHEN OTHERS THEN | |
| error_count := error_count + 1; | |
| RAISE NOTICE ' ✗ ERROR: %', SQLERRM; | |
| RAISE NOTICE ''; | |
| END; | |
| END LOOP; | |
| -- Now analyze regular views | |
| RAISE NOTICE ''; | |
| RAISE NOTICE '--- Analyzing Regular Views ---'; | |
| RAISE NOTICE ''; | |
| FOR view_record IN | |
| SELECT schemaname, viewname, 'VIEW' AS view_type | |
| FROM pg_views | |
| WHERE schemaname = 'public' | |
| ORDER BY viewname | |
| LOOP | |
| BEGIN | |
| view_count := view_count + 1; | |
| reg_view_count := reg_view_count + 1; | |
| RAISE NOTICE 'ANALYZE [%/%]: %.% (%)', view_count, total_views, | |
| view_record.schemaname, view_record.viewname, view_record.view_type; | |
| EXECUTE format('ANALYZE %I.%I', view_record.schemaname, view_record.viewname); | |
| success_count := success_count + 1; | |
| RAISE NOTICE ' ✓ Analyzed successfully'; | |
| RAISE NOTICE ''; | |
| EXCEPTION WHEN OTHERS THEN | |
| error_count := error_count + 1; | |
| RAISE NOTICE ' ✗ ERROR: %', SQLERRM; | |
| RAISE NOTICE ''; | |
| END; | |
| END LOOP; | |
| end_time := clock_timestamp(); | |
| duration := end_time - start_time; | |
| RAISE NOTICE ''; | |
| RAISE NOTICE '================================================'; | |
| RAISE NOTICE 'View analysis summary:'; | |
| RAISE NOTICE ' Total views analyzed: %', view_count; | |
| RAISE NOTICE ' - Materialized views: %', mat_view_count; | |
| RAISE NOTICE ' - Regular views: %', reg_view_count; | |
| RAISE NOTICE ' Successfully analyzed: %', success_count; | |
| RAISE NOTICE ' Errors: %', error_count; | |
| RAISE NOTICE ' Duration: %', duration; | |
| RAISE NOTICE '================================================'; | |
| RAISE NOTICE ''; | |
| END $$; | |
| -- Display statistics results from pg_stats | |
| \echo '' | |
| \echo '==================================================' | |
| \echo '=== ANALYZE STATISTICS RESULTS ===' | |
| \echo '==================================================' | |
| \echo '' | |
| \echo 'Displaying sample statistics for materialized views...' | |
| \echo '' | |
| -- Show row count estimates for all materialized views | |
| -- Note: Materialized views are stored as tables in pg_class, not in pg_stat_user_tables | |
| \copy (SELECT schemaname AS schema, matviewname AS view_name, (SELECT n_live_tup FROM pg_stat_all_tables WHERE schemaname = m.schemaname AND relname = m.matviewname) AS estimated_rows, (SELECT n_dead_tup FROM pg_stat_all_tables WHERE schemaname = m.schemaname AND relname = m.matviewname) AS dead_rows, (SELECT last_analyze FROM pg_stat_all_tables WHERE schemaname = m.schemaname AND relname = m.matviewname) AS last_analyzed, (SELECT last_autoanalyze FROM pg_stat_all_tables WHERE schemaname = m.schemaname AND relname = m.matviewname) AS last_auto_analyzed FROM pg_matviews m WHERE schemaname = 'public' ORDER BY estimated_rows DESC NULLS LAST) TO 'materialized_view_statistics.csv' WITH CSV HEADER; | |
| \echo ' ✓ Statistics exported to: materialized_view_statistics.csv' | |
| \echo '' | |
| -- Display top 10 materialized views by row count | |
| \echo 'Top 10 materialized views by estimated row count:' | |
| \echo '' | |
| SELECT | |
| m.schemaname || '.' || m.matviewname AS view_name, | |
| s.n_live_tup AS estimated_rows, | |
| pg_size_pretty(pg_total_relation_size(m.schemaname || '.' || m.matviewname)) AS total_size, | |
| s.last_analyze AS last_analyzed | |
| FROM pg_matviews m | |
| LEFT JOIN pg_stat_all_tables s ON s.schemaname = m.schemaname AND s.relname = m.matviewname | |
| WHERE m.schemaname = 'public' | |
| ORDER BY s.n_live_tup DESC NULLS LAST | |
| LIMIT 10; | |
| \echo '' | |
| \echo '==================================================' | |
| -- =========================================================================== | |
| -- SECTION 1.5: Generate Individual Query Plan Files | |
| -- =========================================================================== | |
| \echo '' | |
| \echo '==================================================' | |
| \echo '=== GENERATING INDIVIDUAL VIEW QUERY PLANS ===' | |
| \echo '==================================================' | |
| \echo '' | |
| \! mkdir -p view_plans | |
| DO $$ | |
| DECLARE | |
| view_record RECORD; | |
| view_count INTEGER := 0; | |
| file_name TEXT; | |
| explain_query TEXT; | |
| BEGIN | |
| RAISE NOTICE 'Generating individual query plan files...'; | |
| RAISE NOTICE ''; | |
| FOR view_record IN | |
| SELECT schemaname, viewname AS object_name, 'VIEW' AS object_type | |
| FROM pg_views | |
| WHERE schemaname = 'public' | |
| UNION ALL | |
| SELECT schemaname, matviewname AS object_name, 'MATERIALIZED VIEW' AS object_type | |
| FROM pg_matviews | |
| WHERE schemaname = 'public' | |
| ORDER BY object_name | |
| LOOP | |
| BEGIN | |
| view_count := view_count + 1; | |
| file_name := '/workspaces/cia/service.data.impl/sample-data/view_plans/explain_' || view_record.object_name || '.txt'; | |
| RAISE NOTICE '[%] %.%', view_count, view_record.schemaname, view_record.object_name; | |
| -- Execute EXPLAIN and write to file using psql \o command | |
| explain_query := 'EXPLAIN (ANALYZE, BUFFERS, VERBOSE, FORMAT TEXT) SELECT * FROM ' || | |
| quote_ident(view_record.schemaname) || '.' || | |
| quote_ident(view_record.object_name) || ' LIMIT 1'; | |
| -- This approach won't work in DO block, skip EXPLAIN generation | |
| RAISE NOTICE ' ℹ️ Skipping EXPLAIN (requires psql \\o command)'; | |
| EXCEPTION WHEN OTHERS THEN | |
| RAISE NOTICE ' ✗ ERROR: %', SQLERRM; | |
| END; | |
| END LOOP; | |
| RAISE NOTICE ''; | |
| RAISE NOTICE ' Note: EXPLAIN plans require manual execution with psql \\o command'; | |
| RAISE NOTICE ''; | |
| END $$; | |
| -- =========================================================================== | |
| -- SECTION 2: Extract Sample Data from ALL Tables Dynamically | |
| -- =========================================================================== | |
| \echo '' | |
| \echo '==================================================' | |
| \echo '=== PHASE 3: EXTRACTING TABLE SAMPLE DATA ===' | |
| \echo '==================================================' | |
| \echo '' | |
| DO $$ | |
| DECLARE | |
| table_count INTEGER; | |
| empty_count INTEGER := 0; | |
| non_empty_count INTEGER := 0; | |
| table_rec RECORD; | |
| row_count BIGINT; | |
| BEGIN | |
| SELECT COUNT(*) INTO table_count | |
| FROM pg_tables | |
| WHERE schemaname = 'public'; | |
| RAISE NOTICE 'Found % tables in public schema', table_count; | |
| RAISE NOTICE ''; | |
| RAISE NOTICE 'Counting rows in each table (this may take a moment)...'; | |
| RAISE NOTICE ''; | |
| -- Count empty vs non-empty tables with progress | |
| FOR table_rec IN | |
| SELECT tablename | |
| FROM pg_tables | |
| WHERE schemaname = 'public' | |
| ORDER BY tablename | |
| LOOP | |
| row_count := cia_tmp_rowcount('public', table_rec.tablename); | |
| IF row_count > 0 THEN | |
| non_empty_count := non_empty_count + 1; | |
| ELSE | |
| empty_count := empty_count + 1; | |
| END IF; | |
| END LOOP; | |
| RAISE NOTICE '✓ Row count analysis complete'; | |
| RAISE NOTICE ' Tables with data: %', non_empty_count; | |
| RAISE NOTICE ' Empty tables: %', empty_count; | |
| RAISE NOTICE ''; | |
| END $$; | |
| \! rm -f :TABLE_CMD_FILE | |
| \pset format unaligned | |
| \pset tuples_only on | |
| \o :TABLE_CMD_FILE | |
| WITH table_counts AS ( | |
| SELECT schemaname, | |
| tablename, | |
| cia_tmp_rowcount(schemaname, tablename) AS row_count | |
| FROM pg_tables | |
| WHERE schemaname = 'public' | |
| ), | |
| table_extract AS ( | |
| SELECT schemaname, | |
| tablename, | |
| row_count, | |
| LEAST(:SAMPLE_SIZE::int, row_count) AS sample_rows, | |
| CASE WHEN tablename LIKE 'table_%' THEN tablename ELSE 'table_' || tablename END AS file_prefix | |
| FROM table_counts | |
| WHERE row_count > 0 | |
| ) | |
| SELECT format( | |
| '\echo ''[TABLE] Extracting: %I.%I (%s rows sampled of %s total)''' || E'\n' || | |
| '\copy (SELECT * FROM %I.%I LIMIT %s) TO ''%s_sample.csv'' CSV HEADER' || E'\n' || | |
| '\echo '' ✓ Completed: %s_sample.csv''' || E'\n', | |
| schemaname, | |
| tablename, | |
| sample_rows, | |
| row_count, | |
| schemaname, | |
| tablename, | |
| sample_rows, | |
| file_prefix, | |
| file_prefix | |
| ) | |
| FROM table_extract | |
| ORDER BY tablename; | |
| \o | |
| \pset format aligned | |
| \pset tuples_only off | |
| \echo 'Executing table extractions...' | |
| \echo '' | |
| -- Execute with verbose feedback showing what we're about to do | |
| \i :TABLE_CMD_FILE | |
| \! rm -f :TABLE_CMD_FILE | |
| \echo '' | |
| \echo 'Table extraction completed' | |
| \echo '' | |
| -- =========================================================================== | |
| -- SECTION 4: Extract Sample Data from ALL Views Dynamically | |
| -- =========================================================================== | |
| \echo '' | |
| \echo '==================================================' | |
| \echo '=== PHASE 4: EXTRACTING VIEW SAMPLE DATA ===' | |
| \echo '==================================================' | |
| \echo '' | |
| DO $$ | |
| DECLARE | |
| view_record RECORD; | |
| view_count INTEGER := 0; | |
| total_views INTEGER; | |
| row_count BIGINT; | |
| extract_count INTEGER := 0; | |
| BEGIN | |
| -- Get total view count (excluding coalition alignment matrix) | |
| SELECT COUNT(*) INTO total_views | |
| FROM ( | |
| SELECT viewname FROM pg_views | |
| WHERE schemaname = 'public' | |
| AND viewname != 'view_riksdagen_intelligence_dashboard' | |
| UNION ALL | |
| SELECT matviewname FROM pg_matviews | |
| WHERE schemaname = 'public' | |
| ) v; | |
| RAISE NOTICE ''; | |
| RAISE NOTICE 'Phase 1: Analyzing % views for row counts', total_views; | |
| RAISE NOTICE 'Excluding: view_riksdagen_coalition_alignment_matrix (complex query)'; | |
| RAISE NOTICE 'This may take several minutes for complex views...'; | |
| RAISE NOTICE ''; | |
| -- Process each view with progress BEFORE the slow operation | |
| FOR view_record IN | |
| SELECT schemaname, viewname AS object_name, 'VIEW' AS object_type | |
| FROM pg_views | |
| WHERE schemaname = 'public' | |
| AND viewname != 'view_riksdagen_coalition_alignment_matrix' | |
| AND viewname != 'view_riksdagen_intelligence_dashboard' | |
| UNION ALL | |
| SELECT schemaname, matviewname AS object_name, 'MATERIALIZED VIEW' AS object_type | |
| FROM pg_matviews | |
| WHERE schemaname = 'public' | |
| ORDER BY object_name | |
| LOOP | |
| view_count := view_count + 1; | |
| -- Always show what we're about to analyze | |
| RAISE NOTICE '→ [%/%] Analyzing: %.% (%)', | |
| view_count, total_views, | |
| view_record.schemaname, view_record.object_name, | |
| view_record.object_type; | |
| -- Now do the slow count operation | |
| row_count := cia_tmp_rowcount(view_record.schemaname, view_record.object_name); | |
| -- Show result immediately after | |
| IF row_count > 0 THEN | |
| RAISE NOTICE ' ✓ Contains % rows', row_count; | |
| extract_count := extract_count + 1; | |
| ELSE | |
| RAISE NOTICE ' ⚠️ EMPTY (0 rows)'; | |
| END IF; | |
| RAISE NOTICE ''; | |
| END LOOP; | |
| RAISE NOTICE ''; | |
| RAISE NOTICE '✓ Phase 1 complete: % of % views have data', extract_count, total_views; | |
| RAISE NOTICE ''; | |
| END $$; | |
| \echo '' | |
| \echo 'Phase 2: Generating extraction commands...' | |
| \echo '' | |
| \! rm -f :VIEW_CMD_FILE | |
| \pset format unaligned | |
| \pset tuples_only on | |
| \o :VIEW_CMD_FILE | |
| WITH view_counts AS ( | |
| SELECT schemaname, | |
| viewname, | |
| cia_tmp_rowcount(schemaname, viewname) AS row_count | |
| FROM pg_views | |
| WHERE schemaname = 'public' | |
| AND viewname != 'view_riksdagen_coalition_alignment_matrix' | |
| AND viewname != 'view_riksdagen_intelligence_dashboard' | |
| UNION ALL | |
| SELECT schemaname, | |
| matviewname, | |
| cia_tmp_rowcount(schemaname, matviewname) AS row_count | |
| FROM pg_matviews | |
| WHERE schemaname = 'public' | |
| ), | |
| view_extract AS ( | |
| SELECT schemaname, | |
| viewname, | |
| row_count, | |
| LEAST(:SAMPLE_SIZE::int, row_count) AS sample_rows, | |
| CASE WHEN viewname LIKE 'view_%' THEN viewname ELSE 'view_' || viewname END AS file_prefix | |
| FROM view_counts | |
| WHERE row_count > 0 | |
| ) | |
| SELECT format( | |
| '\echo ''[VIEW] Extracting: %s (%s rows sampled of %s total)''' || E'\n' || | |
| '\copy (SELECT * FROM %I.%I ORDER BY random() LIMIT %s) TO ''%s_sample.csv'' WITH CSV HEADER' || E'\n' || | |
| '\echo '' ✓ Completed: %s_sample.csv''' || E'\n', | |
| viewname, | |
| sample_rows, | |
| row_count, | |
| schemaname, | |
| viewname, | |
| sample_rows, | |
| file_prefix, | |
| file_prefix | |
| ) | |
| FROM view_extract | |
| ORDER BY viewname; | |
| \o | |
| \pset format aligned | |
| \pset tuples_only off | |
| \echo '' | |
| \echo 'Phase 3: Executing view extractions...' | |
| \echo '' | |
| \i /tmp/cia_view_extract_commands.sql | |
| \! rm -f /tmp/cia_view_extract_commands.sql | |
| \echo '' | |
| \echo '==================================================' | |
| \echo 'View extraction completed' | |
| \echo '==================================================' | |
| \echo '' | |
| -- =========================================================================== | |
| -- CLEANUP | |
| -- =========================================================================== | |
| \echo '' | |
| \echo '==================================================' | |
| \echo '=== CLEANUP ===' | |
| \echo '==================================================' | |
| DROP FUNCTION IF EXISTS cia_tmp_rowcount(text, text); | |
| \echo 'Dropped helper function: cia_tmp_rowcount' | |
| \echo '' | |
| \echo '==================================================' | |
| \echo 'CIA Sample Data Extraction COMPLETE' | |
| \echo 'Finished:' `date` | |
| \echo '==================================================' | |
| \echo '' | |
| \echo 'Output files:' | |
| \echo ' - distinct_values/*.csv : All distinct values for predicate columns' | |
| \echo ' - table_*.csv : Sample data from tables' | |
| \echo ' - view_*.csv : Sample data from views' | |
| \echo '' | |