OCR_Dashboard / data.sql
tyasnk's picture
Upload folder using huggingface_hub
34844c6 verified
WITH dedup_master_files AS (
SELECT *
FROM (
SELECT
*,
LOWER(
REGEXP_REPLACE(original_filename, r'^(Copy of\s*)+', '')
) AS cleaned_filename,
ROW_NUMBER() OVER (
PARTITION BY original_filename
ORDER BY created_at DESC
) AS rn
FROM `stellar-orb-451904-d9.raw_data_ocr_kia.master_files`
)
WHERE rn = 1
),
dedup_page1_cover AS (
SELECT *
FROM (
SELECT
*,
ROW_NUMBER() OVER (
PARTITION BY file_id
ORDER BY created_at DESC
) AS rn
FROM `stellar-orb-451904-d9.raw_data_ocr_kia.page1_cover`
)
WHERE rn = 1
),
dedup_page2_identitas AS (
SELECT *
FROM (
SELECT
*,
ROW_NUMBER() OVER (
PARTITION BY file_id
ORDER BY inserted_at DESC
) AS rn
FROM `stellar-orb-451904-d9.raw_data_ocr_kia.page2_identitas`
)
WHERE rn = 1
),
label AS (
SELECT
filename AS filename_label,
project_id_mother AS project_id_mother_label,
CASE
WHEN first_name IS NOT NULL
AND last_name IS NOT NULL
AND first_name = last_name
THEN first_name
ELSE CONCAT(
IFNULL(first_name, ''),
IFNULL(
CONCAT(' ', last_name),
''
)
)
END AS nama_ibu_cover_label,
puskesmas_name AS dikeluarkan_oleh_fasilitas_kesehatan_label,
regency_name AS kabupaten_kota_label,
CASE
WHEN first_name IS NOT NULL
AND last_name IS NOT NULL
AND first_name = last_name
THEN first_name
ELSE CONCAT(
IFNULL(first_name, ''),
IFNULL(
CONCAT(' ', last_name),
''
)
)
END AS nama_ibu,
nik_mother AS nik_ibu_label,
birth_date AS tempat_tanggal_lahir_ibu_label,
address_street AS alamat_rumah_ibu_label,
contact_number AS telepon_ibu_label,
bpjs_mother AS no_jkn_ibu_label,
education_level AS pendidikan_ibu_label,
occupation AS pekerjaan_ibu_label,
blood_type_result AS golongan_darah_ibu_label,
age AS usia_ibu_label,
pregnancy_number AS kehamilan_ke_label,
number_live_birth AS jumlah_anak_lahir_hidup_label,
number_birth_lost AS riwayat_keguguran_label,
previous_preg_issue AS riwayat_penyakit_ibu_label
FROM (
SELECT
*,
ROW_NUMBER() OVER (
PARTITION BY `filename`
ORDER BY `end` DESC
) AS rn
FROM `stellar-orb-451904-d9.ocr_bukukia_ocr_sid.List Correct Entry 2025`
)
WHERE
rn = 1
)
,
main AS (
SELECT
MF.id,
MF.cleaned_filename AS `filename`,
REGEXP_EXTRACT(MF.original_filename, r'(7000\d{4,})') AS project_id_mother,
P1.nama_ibu_cover,
P1.dikeluarkan_oleh_fasilitas_kesehatan,
P1.kabupaten_kota,
P2.nama_ibu,
P2.nik_ibu,
P2.tempat_tanggal_lahir_ibu,
P2.alamat_rumah_ibu,
P2.telepon_ibu,
P2.no_jkn_ibu,
P2.pendidikan_ibu,
P2.pekerjaan_ibu,
P2.golongan_darah_ibu,
P2.usia_ibu,
P2.kehamilan_ke,
P2.jumlah_anak_lahir_hidup,
P2.riwayat_keguguran,
P2.riwayat_penyakit_ibu,
L.*,
ROW_NUMBER() OVER (
PARTITION BY MF.`cleaned_filename`
ORDER BY MF.`cleaned_filename` DESC
) AS rn
FROM dedup_master_files MF
LEFT JOIN dedup_page1_cover P1 ON MF.id = P1.id
LEFT JOIN dedup_page2_identitas P2 ON MF.id = P2.id
LEFT JOIN label L ON CAST(REGEXP_EXTRACT(MF.original_filename, r'(7000\d{4,})') AS INTEGER) = L.project_id_mother_label
)
SELECT *
FROM main
WHERE
rn = 1 and
id != "9a21ef00-6b02-4b04-81c3-68e25e2c8b7f" --duplicated