diff --git "a/artifacts/evaluation/latest/summary.json" "b/artifacts/evaluation/latest/summary.json" --- "a/artifacts/evaluation/latest/summary.json" +++ "b/artifacts/evaluation/latest/summary.json" @@ -21,7 +21,7 @@ "results": [ { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Automotive", "model_output.classification.iab_content.tier2.label": null }, @@ -32,6 +32,11 @@ }, "id": "car-buying-maps-to-automotive-buying", "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, { "actual": null, "expected": "Auto Type", @@ -71,7 +76,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null @@ -84,6 +89,11 @@ }, "id": "labtop-buying-maps-to-laptops", "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, { "actual": null, "expected": "Computing", @@ -102,10 +112,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier3.label": "Software and Applications" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -116,9 +126,9 @@ "id": "crm-awareness-maps-to-sales", "mismatches": [ { - "actual": null, - "expected": "Software and Applications", - "path": "model_output.classification.iab_content.tier3.label" + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "CRM education should resolve to the closest business/sales path, not generic software.", @@ -128,9 +138,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier2.label": "Robotics", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -142,7 +152,12 @@ "id": "crm-comparison-maps-to-sales", "mismatches": [ { - "actual": null, + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Robotics", "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -159,9 +174,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Careers", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier2.label": "Job Search", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -178,7 +193,12 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": null, + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Job Search", "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -196,7 +216,7 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Sports" + "model_output.classification.iab_content.tier1.label": "Science" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -205,7 +225,7 @@ "id": "ml-explanation-maps-to-ai", "mismatches": [ { - "actual": "Sports", + "actual": "Science", "expected": "Technology & Computing", "path": "model_output.classification.iab_content.tier1.label" } @@ -217,10 +237,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier3.label": "Information and Network Security" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -231,7 +251,12 @@ "id": "support-credential-help-maps-to-business-it", "mismatches": [ { - "actual": null, + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Information and Network Security", "expected": "Internet", "path": "model_output.classification.iab_content.tier3.label" } @@ -267,9 +292,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Sports", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Sensitive Topics", + "model_output.classification.iab_content.tier2.label": "Crime & Harmful Acts to Individuals, Society & Human Right Violations", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -281,12 +306,17 @@ "id": "trial-signup-maps-to-software", "mismatches": [ { - "actual": "Sports", + "actual": "Sensitive Topics", "expected": "Hobbies & Interests", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": null, + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Crime & Harmful Acts to Individuals, Society & Human Right Violations", "expected": "Content Production", "path": "model_output.classification.iab_content.tier2.label" }, @@ -376,21 +406,21 @@ "iab_cross_vertical_behavior_lock_regression": { "by_status": { "must_fix": { - "failed": 88, - "passed": 2, + "failed": 90, + "passed": 0, "total": 90 } }, "cases_path": "/content/agentic-intent-classifier/examples/iab_cross_vertical_behavior_lock_cases.json", "count": 90, - "failed": 88, - "passed": 2, + "failed": 90, + "passed": 0, "results": [ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Automotive", - "model_output.classification.iab_content.tier2.label": "Auto Rentals" + "model_output.classification.iab_content.tier1.label": "Personal Finance", + "model_output.classification.iab_content.tier2.label": "Insurance" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -400,7 +430,7 @@ "id": "auto-buying-easy", "mismatches": [ { - "actual": "Automotive", + "actual": "Personal Finance", "expected": "Travel", "path": "model_output.classification.iab_content.tier1.label" }, @@ -410,7 +440,7 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Auto Rentals", + "actual": "Insurance", "expected": "Travel Type", "path": "model_output.classification.iab_content.tier2.label" } @@ -422,7 +452,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Automotive", "model_output.classification.iab_content.tier2.label": "Auto Body Styles" }, @@ -432,17 +462,23 @@ "model_output.classification.iab_content.tier2.label": "Auto Body Styles" }, "id": "auto-buying-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Automotive > Auto Buying and Selling.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best used SUV for a family of four" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Automotive", - "model_output.classification.iab_content.tier2.label": "Auto Type" + "model_output.classification.iab_content.tier2.label": "Car Culture" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -450,18 +486,29 @@ "model_output.classification.iab_content.tier2.label": "Auto Type" }, "id": "auto-buying-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Car Culture", + "expected": "Auto Type", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Automotive > Auto Buying and Selling.", - "pass": true, + "pass": false, "status": "must_fix", "text": "I need a shortlist of practical cars before making a purchase this month" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier3.label": "Software and Applications" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -472,9 +519,9 @@ "id": "sales-crm-easy", "mismatches": [ { - "actual": null, - "expected": "Software and Applications", - "path": "model_output.classification.iab_content.tier3.label" + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Cross-vertical easy IAB mapping case for Business and Finance > Business > Sales.", @@ -484,9 +531,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier2.label": "Robotics", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -498,7 +545,12 @@ "id": "sales-crm-medium", "mismatches": [ { - "actual": null, + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Robotics", "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -515,10 +567,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Business and Finance", "model_output.classification.iab_content.tier2.label": "Business", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier3.label": "Sales" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -529,9 +581,9 @@ "id": "sales-crm-hard", "mismatches": [ { - "actual": null, - "expected": "Sales", - "path": "model_output.classification.iab_content.tier3.label" + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Cross-vertical hard IAB mapping case for Business and Finance > Business > Sales.", @@ -541,9 +593,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Hobbies & Interests", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": "Job Search", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -555,12 +607,17 @@ "id": "marketing-tools-easy", "mismatches": [ { - "actual": "Hobbies & Interests", + "actual": "Careers", "expected": "Technology & Computing", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": null, + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Job Search", "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -577,9 +634,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Careers", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Sensitive Topics", + "model_output.classification.iab_content.tier2.label": "Terrorism" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -589,12 +646,17 @@ "id": "marketing-tools-medium", "mismatches": [ { - "actual": "Careers", + "actual": "Sensitive Topics", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": null, + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Terrorism", "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" } @@ -606,9 +668,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Personal Finance", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier2.label": "Home Utilities", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -625,7 +687,12 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": null, + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Home Utilities", "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -642,10 +709,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier3.label": "Information and Network Security" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -656,7 +723,12 @@ "id": "business-it-easy", "mismatches": [ { - "actual": null, + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Information and Network Security", "expected": "Internet", "path": "model_output.classification.iab_content.tier3.label" } @@ -668,9 +740,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Personal Finance", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": "Job Search" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -680,14 +752,9 @@ "id": "business-it-medium", "mismatches": [ { - "actual": "Personal Finance", - "expected": "Careers", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": null, - "expected": "Job Search", - "path": "model_output.classification.iab_content.tier2.label" + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Cross-vertical medium IAB mapping case for Business and Finance > Business > Business I.T..", @@ -787,8 +854,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Food & Drink", - "model_output.classification.iab_content.tier2.label": "Dining Out" + "model_output.classification.iab_content.tier1.label": "Attractions", + "model_output.classification.iab_content.tier2.label": "Bars & Restaurants" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -797,10 +864,20 @@ }, "id": "dining-out-hard", "mismatches": [ + { + "actual": "Attractions", + "expected": "Food & Drink", + "path": "model_output.classification.iab_content.tier1.label" + }, { "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Bars & Restaurants", + "expected": "Dining Out", + "path": "model_output.classification.iab_content.tier2.label" } ], "notes": "Cross-vertical hard IAB mapping case for Food & Drink > Dining Out.", @@ -877,7 +954,7 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Sports" + "model_output.classification.iab_content.tier1.label": "Science" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -886,7 +963,7 @@ "id": "artificial-intelligence-easy", "mismatches": [ { - "actual": "Sports", + "actual": "Science", "expected": "Technology & Computing", "path": "model_output.classification.iab_content.tier1.label" } @@ -946,7 +1023,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing" }, @@ -962,6 +1039,11 @@ "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, { "actual": "Computing", "expected": "Business", @@ -975,10 +1057,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier3.label": "Software and Applications", "model_output.classification.iab_content.tier4.label": null }, "expected": { @@ -991,7 +1073,12 @@ "id": "software-apps-medium", "mismatches": [ { - "actual": null, + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Software and Applications", "expected": "Internet", "path": "model_output.classification.iab_content.tier3.label" }, @@ -1008,9 +1095,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Business and Finance", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Technology & Computing", + "model_output.classification.iab_content.tier2.label": "Virtual Reality", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -1022,12 +1109,12 @@ "id": "software-apps-hard", "mismatches": [ { - "actual": "Business and Finance", - "expected": "Technology & Computing", - "path": "model_output.classification.iab_content.tier1.label" + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": null, + "actual": "Virtual Reality", "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -1092,10 +1179,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier3.label": "Information and Network Security", "model_output.classification.iab_content.tier4.label": null }, "expected": { @@ -1108,7 +1195,12 @@ "id": "communication-software-medium", "mismatches": [ { - "actual": null, + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Information and Network Security", "expected": "Software and Applications", "path": "model_output.classification.iab_content.tier3.label" }, @@ -1125,9 +1217,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Careers", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Technology & Computing", + "model_output.classification.iab_content.tier2.label": "Virtual Reality", "model_output.classification.iab_content.tier3.label": null, "model_output.classification.iab_content.tier4.label": null }, @@ -1141,12 +1233,12 @@ "id": "communication-software-hard", "mismatches": [ { - "actual": "Careers", - "expected": "Technology & Computing", - "path": "model_output.classification.iab_content.tier1.label" + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": null, + "actual": "Virtual Reality", "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -1168,11 +1260,11 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": null, - "model_output.classification.iab_content.tier4.label": null + "model_output.classification.iab_content.tier3.label": "Internet", + "model_output.classification.iab_content.tier4.label": "Web Hosting" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1184,14 +1276,9 @@ "id": "web-hosting-easy", "mismatches": [ { - "actual": null, - "expected": "Internet", - "path": "model_output.classification.iab_content.tier3.label" - }, - { - "actual": null, - "expected": "Web Hosting", - "path": "model_output.classification.iab_content.tier4.label" + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", @@ -1201,11 +1288,11 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": null, - "model_output.classification.iab_content.tier4.label": null + "model_output.classification.iab_content.tier3.label": "Internet", + "model_output.classification.iab_content.tier4.label": "Web Hosting" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1217,14 +1304,9 @@ "id": "web-hosting-medium", "mismatches": [ { - "actual": null, - "expected": "Internet", - "path": "model_output.classification.iab_content.tier3.label" - }, - { - "actual": null, - "expected": "Web Hosting", - "path": "model_output.classification.iab_content.tier4.label" + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", @@ -1234,11 +1316,11 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": null, - "model_output.classification.iab_content.tier4.label": null + "model_output.classification.iab_content.tier3.label": "Internet", + "model_output.classification.iab_content.tier4.label": "Web Hosting" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1250,14 +1332,9 @@ "id": "web-hosting-hard", "mismatches": [ { - "actual": null, - "expected": "Internet", - "path": "model_output.classification.iab_content.tier3.label" - }, - { - "actual": null, - "expected": "Web Hosting", - "path": "model_output.classification.iab_content.tier4.label" + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", @@ -1267,10 +1344,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": null, - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier2.label": "Computing", + "model_output.classification.iab_content.tier3.label": "Laptops" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1281,14 +1358,9 @@ "id": "laptops-easy", "mismatches": [ { - "actual": null, - "expected": "Computing", - "path": "model_output.classification.iab_content.tier2.label" - }, - { - "actual": null, - "expected": "Laptops", - "path": "model_output.classification.iab_content.tier3.label" + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Computing > Laptops.", @@ -1324,10 +1396,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier3.label": "Laptops" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1337,13 +1409,18 @@ }, "id": "laptops-hard", "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, { "actual": "Computing", "expected": "Consumer Electronics", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": null, + "actual": "Laptops", "expected": "Smartphones", "path": "model_output.classification.iab_content.tier3.label" } @@ -1355,11 +1432,11 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Software and Applications", - "model_output.classification.iab_content.tier4.label": null + "model_output.classification.iab_content.tier4.label": "Computer Animation" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1371,7 +1448,12 @@ "id": "desktops-easy", "mismatches": [ { - "actual": null, + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Computer Animation", "expected": "Photo Editing Software", "path": "model_output.classification.iab_content.tier4.label" } @@ -1383,10 +1465,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier3.label": "Desktops" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1397,9 +1479,9 @@ "id": "desktops-medium", "mismatches": [ { - "actual": null, - "expected": "Desktops", - "path": "model_output.classification.iab_content.tier3.label" + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Desktops.", @@ -1462,7 +1544,7 @@ "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Consumer Electronics", - "model_output.classification.iab_content.tier3.label": "Wearable Technology" + "model_output.classification.iab_content.tier3.label": "Smartphones" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1476,11 +1558,6 @@ "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Wearable Technology", - "expected": "Smartphones", - "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Consumer Electronics > Smartphones.", @@ -1493,7 +1570,7 @@ "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Consumer Electronics", - "model_output.classification.iab_content.tier3.label": "Wearable Technology" + "model_output.classification.iab_content.tier3.label": "Smartphones" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1507,11 +1584,6 @@ "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Wearable Technology", - "expected": "Smartphones", - "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Consumer Electronics > Smartphones.", @@ -1523,8 +1595,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Designer Clothing", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier2.label": "Men's Fashion", + "model_output.classification.iab_content.tier3.label": "Men's Shoes and Footwear" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1540,12 +1612,12 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Designer Clothing", + "actual": "Men's Fashion", "expected": "Women's Fashion", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": null, + "actual": "Men's Shoes and Footwear", "expected": "Women's Shoes and Footwear", "path": "model_output.classification.iab_content.tier3.label" } @@ -1595,8 +1667,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Designer Clothing", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier2.label": "Women's Fashion", + "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1610,16 +1682,6 @@ "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Designer Clothing", - "expected": "Women's Fashion", - "path": "model_output.classification.iab_content.tier2.label" - }, - { - "actual": null, - "expected": "Women's Shoes and Footwear", - "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical hard IAB mapping case for Style & Fashion.", @@ -1629,9 +1691,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Sports", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier2.label": "Bodybuilding", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -1648,7 +1710,12 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": null, + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Bodybuilding", "expected": "Women's Fashion", "path": "model_output.classification.iab_content.tier2.label" }, @@ -1732,9 +1799,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier2.label": "Children's Clothing", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -1746,7 +1813,12 @@ "id": "mens-shoes-easy", "mismatches": [ { - "actual": null, + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Children's Clothing", "expected": "Men's Fashion", "path": "model_output.classification.iab_content.tier2.label" }, @@ -1794,9 +1866,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier2.label": "Children's Clothing", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -1808,7 +1880,12 @@ "id": "mens-shoes-hard", "mismatches": [ { - "actual": null, + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Children's Clothing", "expected": "Men's Fashion", "path": "model_output.classification.iab_content.tier2.label" }, @@ -1887,7 +1964,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Travel", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.tier2.label": "Travel Type" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1900,11 +1977,6 @@ "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Travel Type", - "path": "model_output.classification.iab_content.tier2.label" } ], "notes": "Cross-vertical hard IAB mapping case for Travel > Travel Type > Hotels and Motels.", @@ -2003,10 +2075,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Healthy Living", - "model_output.classification.iab_content.tier2.label": null, - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier2.label": "Fitness and Exercise", + "model_output.classification.iab_content.tier3.label": "Running and Jogging" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2022,12 +2094,17 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": null, + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Fitness and Exercise", "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": null, + "actual": "Running and Jogging", "expected": "Green Solutions", "path": "model_output.classification.iab_content.tier3.label" } @@ -2081,8 +2158,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Healthy Living", - "model_output.classification.iab_content.tier2.label": "Fitness and Exercise", + "model_output.classification.iab_content.tier1.label": "Sports", + "model_output.classification.iab_content.tier2.label": "Walking", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -2093,11 +2170,21 @@ }, "id": "running-and-jogging-hard", "mismatches": [ + { + "actual": "Sports", + "expected": "Healthy Living", + "path": "model_output.classification.iab_content.tier1.label" + }, { "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" }, + { + "actual": "Walking", + "expected": "Fitness and Exercise", + "path": "model_output.classification.iab_content.tier2.label" + }, { "actual": null, "expected": "Running and Jogging", @@ -2216,8 +2303,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Books and Literature", - "model_output.classification.iab_content.tier2.label": "Fiction" + "model_output.classification.iab_content.tier1.label": "Travel", + "model_output.classification.iab_content.tier2.label": "Travel Type" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2226,10 +2313,20 @@ }, "id": "fiction-medium", "mismatches": [ + { + "actual": "Travel", + "expected": "Books and Literature", + "path": "model_output.classification.iab_content.tier1.label" + }, { "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Travel Type", + "expected": "Fiction", + "path": "model_output.classification.iab_content.tier2.label" } ], "notes": "Cross-vertical medium IAB mapping case for Books and Literature > Fiction.", @@ -2263,7 +2360,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Home & Garden", - "model_output.classification.iab_content.tier2.label": "Remodeling & Construction" + "model_output.classification.iab_content.tier2.label": "Interior Decorating" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2276,6 +2373,11 @@ "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Interior Decorating", + "expected": "Remodeling & Construction", + "path": "model_output.classification.iab_content.tier2.label" } ], "notes": "Cross-vertical easy IAB mapping case for Home & Garden > Home Improvement.", @@ -2285,9 +2387,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Home & Garden", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier2.label": "Interior Decorating", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -2304,7 +2406,12 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": null, + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Interior Decorating", "expected": "Personal Care", "path": "model_output.classification.iab_content.tier2.label" }, @@ -2355,9 +2462,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.tier2.label": "Augmented Reality" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2372,7 +2479,12 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": null, + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Augmented Reality", "expected": "Language Learning", "path": "model_output.classification.iab_content.tier2.label" } @@ -2408,8 +2520,8 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Healthy Living" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Careers" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2418,9 +2530,9 @@ "id": "online-education-hard", "mismatches": [ { - "actual": "Healthy Living", - "expected": "Careers", - "path": "model_output.classification.iab_content.tier1.label" + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Cross-vertical hard IAB mapping case for Education > Online Education.", @@ -2511,10 +2623,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Medical Health", "model_output.classification.iab_content.tier2.label": "Diseases and Conditions", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier3.label": "Allergies" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2525,9 +2637,9 @@ "id": "medical-health-easy", "mismatches": [ { - "actual": null, - "expected": "Allergies", - "path": "model_output.classification.iab_content.tier3.label" + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Cross-vertical easy IAB mapping case for Medical Health.", @@ -2540,7 +2652,7 @@ "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Medical Health", "model_output.classification.iab_content.tier2.label": "Diseases and Conditions", - "model_output.classification.iab_content.tier3.label": "Injuries", + "model_output.classification.iab_content.tier3.label": "Bone and Joint Conditions", "model_output.classification.iab_content.tier4.label": null }, "expected": { @@ -2557,6 +2669,11 @@ "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" }, + { + "actual": "Bone and Joint Conditions", + "expected": "Injuries", + "path": "model_output.classification.iab_content.tier3.label" + }, { "actual": null, "expected": "First Aid", @@ -2572,7 +2689,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Medical Health", - "model_output.classification.iab_content.tier2.label": "Surgery", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -2594,7 +2711,7 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Surgery", + "actual": null, "expected": "Wellness", "path": "model_output.classification.iab_content.tier2.label" }, @@ -2635,10 +2752,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Business and Finance", "model_output.classification.iab_content.tier2.label": "Business", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier3.label": "Sales" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2649,10 +2766,10 @@ "id": "careers-job-search-medium", "mismatches": [ { - "actual": null, - "expected": "Sales", - "path": "model_output.classification.iab_content.tier3.label" - } + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } ], "notes": "Cross-vertical medium IAB mapping case for Careers > Job Search.", "pass": false, @@ -2661,9 +2778,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Genres", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.tier2.label": "Talk Show" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2678,7 +2795,12 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": null, + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Talk Show", "expected": "Job Search", "path": "model_output.classification.iab_content.tier2.label" } @@ -2690,9 +2812,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Personal Celebrations & Life Events", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Holidays", + "model_output.classification.iab_content.tier2.label": "National & Civic Holidays" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2702,12 +2824,17 @@ "id": "personal-finance-easy", "mismatches": [ { - "actual": "Personal Celebrations & Life Events", + "actual": "Holidays", "expected": "Food & Drink", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": null, + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "National & Civic Holidays", "expected": "Food Movements", "path": "model_output.classification.iab_content.tier2.label" } @@ -2809,8 +2936,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Hobbies & Interests", - "model_output.classification.iab_content.tier2.label": "Content Production" + "model_output.classification.iab_content.tier1.label": "Genres", + "model_output.classification.iab_content.tier2.label": "Family/Children" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2820,7 +2947,7 @@ "id": "parenting-medium", "mismatches": [ { - "actual": "Hobbies & Interests", + "actual": "Genres", "expected": "Family and Relationships", "path": "model_output.classification.iab_content.tier1.label" }, @@ -2830,7 +2957,7 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Content Production", + "actual": "Family/Children", "expected": "Parenting", "path": "model_output.classification.iab_content.tier2.label" } @@ -2845,7 +2972,7 @@ "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Family and Relationships", "model_output.classification.iab_content.tier2.label": "Parenting", - "model_output.classification.iab_content.tier3.label": "Special Needs Kids" + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2859,6 +2986,11 @@ "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Special Needs Kids", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical hard IAB mapping case for Family and Relationships > Parenting.", @@ -2890,9 +3022,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Food & Drink", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Home & Garden", + "model_output.classification.iab_content.tier2.label": "Gardening" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2902,14 +3034,9 @@ "id": "gardening-medium", "mismatches": [ { - "actual": "Food & Drink", - "expected": "Home & Garden", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": null, - "expected": "Gardening", - "path": "model_output.classification.iab_content.tier2.label" + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Cross-vertical medium IAB mapping case for Home & Garden > Gardening.", @@ -2966,8 +3093,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Genres", - "model_output.classification.iab_content.tier2.label": "Horror", + "model_output.classification.iab_content.tier1.label": "Entertainment", + "model_output.classification.iab_content.tier2.label": "Movies", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -2979,7 +3106,7 @@ "id": "movies-medium", "mismatches": [ { - "actual": "Genres", + "actual": "Entertainment", "expected": "Video Gaming", "path": "model_output.classification.iab_content.tier1.label" }, @@ -2989,7 +3116,7 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Horror", + "actual": "Movies", "expected": "Video Game Genres", "path": "model_output.classification.iab_content.tier2.label" }, @@ -3045,21 +3172,21 @@ "iab_cross_vertical_quality_target_eval": { "by_status": { "must_fix": { - "failed": 63, - "passed": 27, + "failed": 49, + "passed": 41, "total": 90 } }, "cases_path": "/content/agentic-intent-classifier/examples/iab_cross_vertical_mapping_cases.json", "count": 90, - "failed": 63, - "passed": 27, + "failed": 49, + "passed": 41, "results": [ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Automotive", - "model_output.classification.iab_content.tier2.label": "Auto Rentals" + "model_output.classification.iab_content.tier1.label": "Personal Finance", + "model_output.classification.iab_content.tier2.label": "Insurance" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -3068,13 +3195,18 @@ }, "id": "auto-buying-easy", "mismatches": [ + { + "actual": "Personal Finance", + "expected": "Automotive", + "path": "model_output.classification.iab_content.tier1.label" + }, { "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Auto Rentals", + "actual": "Insurance", "expected": "Auto Buying and Selling", "path": "model_output.classification.iab_content.tier2.label" } @@ -3086,7 +3218,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Automotive", "model_output.classification.iab_content.tier2.label": "Auto Body Styles" }, @@ -3097,6 +3229,11 @@ }, "id": "auto-buying-medium", "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, { "actual": "Auto Body Styles", "expected": "Auto Buying and Selling", @@ -3110,9 +3247,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Automotive", - "model_output.classification.iab_content.tier2.label": "Auto Type" + "model_output.classification.iab_content.tier2.label": "Car Culture" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -3122,7 +3259,12 @@ "id": "auto-buying-hard", "mismatches": [ { - "actual": "Auto Type", + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Car Culture", "expected": "Auto Buying and Selling", "path": "model_output.classification.iab_content.tier2.label" } @@ -3134,10 +3276,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier3.label": "Software and Applications" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3152,18 +3294,13 @@ "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, { "actual": "Computing", "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": null, + "actual": "Software and Applications", "expected": "Sales", "path": "model_output.classification.iab_content.tier3.label" } @@ -3175,9 +3312,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier2.label": "Robotics", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -3194,12 +3331,7 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, + "actual": "Robotics", "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, @@ -3216,10 +3348,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Business and Finance", "model_output.classification.iab_content.tier2.label": "Business", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier3.label": "Sales" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3228,28 +3360,17 @@ "model_output.classification.iab_content.tier3.label": "Sales" }, "id": "sales-crm-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Sales", - "path": "model_output.classification.iab_content.tier3.label" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Business and Finance > Business > Sales.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Need software to manage leads and pipeline for a startup sales team" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Hobbies & Interests", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": "Job Search", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -3261,17 +3382,12 @@ "id": "marketing-tools-easy", "mismatches": [ { - "actual": "Hobbies & Interests", + "actual": "Careers", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, + "actual": "Job Search", "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, @@ -3288,9 +3404,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Careers", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Sensitive Topics", + "model_output.classification.iab_content.tier2.label": "Terrorism", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -3302,17 +3418,12 @@ "id": "marketing-tools-medium", "mismatches": [ { - "actual": "Careers", + "actual": "Sensitive Topics", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, + "actual": "Terrorism", "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, @@ -3329,9 +3440,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Personal Finance", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier2.label": "Home Utilities", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -3348,12 +3459,7 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, + "actual": "Home Utilities", "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, @@ -3370,10 +3476,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier3.label": "Information and Network Security" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3388,18 +3494,13 @@ "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, { "actual": "Computing", "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": null, + "actual": "Information and Network Security", "expected": "Business I.T.", "path": "model_output.classification.iab_content.tier3.label" } @@ -3411,9 +3512,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Personal Finance", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": "Job Search", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -3425,17 +3526,12 @@ "id": "business-it-medium", "mismatches": [ { - "actual": "Personal Finance", + "actual": "Careers", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, + "actual": "Job Search", "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, @@ -3536,8 +3632,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Food & Drink", - "model_output.classification.iab_content.tier2.label": "Dining Out" + "model_output.classification.iab_content.tier1.label": "Attractions", + "model_output.classification.iab_content.tier2.label": "Bars & Restaurants" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3545,9 +3641,20 @@ "model_output.classification.iab_content.tier2.label": "Dining Out" }, "id": "dining-out-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "Attractions", + "expected": "Food & Drink", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "Bars & Restaurants", + "expected": "Dining Out", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Food & Drink > Dining Out.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need a place to eat tonight where I can make a reservation online" }, @@ -3591,7 +3698,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink", - "model_output.classification.iab_content.tier2.label": "Alcoholic Beverages" + "model_output.classification.iab_content.tier2.label": "Non-Alcoholic Beverages" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3599,16 +3706,22 @@ "model_output.classification.iab_content.tier2.label": "Alcoholic Beverages" }, "id": "alcoholic-beverages-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "Non-Alcoholic Beverages", + "expected": "Alcoholic Beverages", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Food & Drink > Alcoholic Beverages.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Want a spirit-forward drink recommendation, not a restaurant suggestion" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Sports", + "model_output.classification.iab_content.tier1.label": "Science", "model_output.classification.iab_content.tier2.label": null }, "expected": { @@ -3619,7 +3732,7 @@ "id": "artificial-intelligence-easy", "mismatches": [ { - "actual": "Sports", + "actual": "Science", "expected": "Technology & Computing", "path": "model_output.classification.iab_content.tier1.label" }, @@ -3699,10 +3812,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier3.label": "Software and Applications" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3711,29 +3824,18 @@ "model_output.classification.iab_content.tier3.label": "Software and Applications" }, "id": "software-apps-easy", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Software and Applications", - "path": "model_output.classification.iab_content.tier3.label" - } - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Computing > Computer Software and Applications.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Best workflow software for a small operations team" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier3.label": "Software and Applications" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3742,28 +3844,17 @@ "model_output.classification.iab_content.tier3.label": "Software and Applications" }, "id": "software-apps-medium", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Software and Applications", - "path": "model_output.classification.iab_content.tier3.label" - } - ], + "mismatches": [], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Computer Software and Applications.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Need project management software for a distributed team" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Business and Finance", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Technology & Computing", + "model_output.classification.iab_content.tier2.label": "Virtual Reality", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -3775,17 +3866,7 @@ "id": "software-apps-hard", "mismatches": [ { - "actual": "Business and Finance", - "expected": "Technology & Computing", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, + "actual": "Virtual Reality", "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -3845,10 +3926,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier3.label": "Information and Network Security", "model_output.classification.iab_content.tier4.label": null }, "expected": { @@ -3861,12 +3942,7 @@ "id": "communication-software-medium", "mismatches": [ { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, + "actual": "Information and Network Security", "expected": "Software and Applications", "path": "model_output.classification.iab_content.tier3.label" }, @@ -3883,9 +3959,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Careers", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Technology & Computing", + "model_output.classification.iab_content.tier2.label": "Virtual Reality", "model_output.classification.iab_content.tier3.label": null, "model_output.classification.iab_content.tier4.label": null }, @@ -3899,17 +3975,7 @@ "id": "communication-software-hard", "mismatches": [ { - "actual": "Careers", - "expected": "Technology & Computing", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, + "actual": "Virtual Reality", "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -3931,11 +3997,11 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": null, - "model_output.classification.iab_content.tier4.label": null + "model_output.classification.iab_content.tier3.label": "Internet", + "model_output.classification.iab_content.tier4.label": "Web Hosting" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3945,35 +4011,19 @@ "model_output.classification.iab_content.tier4.label": "Web Hosting" }, "id": "web-hosting-easy", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Internet", - "path": "model_output.classification.iab_content.tier3.label" - }, - { - "actual": null, - "expected": "Web Hosting", - "path": "model_output.classification.iab_content.tier4.label" - } - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Vercel vs Netlify for website hosting" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": null, - "model_output.classification.iab_content.tier4.label": null + "model_output.classification.iab_content.tier3.label": "Internet", + "model_output.classification.iab_content.tier4.label": "Web Hosting" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3983,35 +4033,19 @@ "model_output.classification.iab_content.tier4.label": "Web Hosting" }, "id": "web-hosting-medium", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Internet", - "path": "model_output.classification.iab_content.tier3.label" - }, - { - "actual": null, - "expected": "Web Hosting", - "path": "model_output.classification.iab_content.tier4.label" - } - ], + "mismatches": [], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Best hosting platform for a startup website" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": null, - "model_output.classification.iab_content.tier4.label": null + "model_output.classification.iab_content.tier3.label": "Internet", + "model_output.classification.iab_content.tier4.label": "Web Hosting" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4021,34 +4055,18 @@ "model_output.classification.iab_content.tier4.label": "Web Hosting" }, "id": "web-hosting-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Internet", - "path": "model_output.classification.iab_content.tier3.label" - }, - { - "actual": null, - "expected": "Web Hosting", - "path": "model_output.classification.iab_content.tier4.label" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Need a managed hosting provider to deploy and run our marketing site" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": null, - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier2.label": "Computing", + "model_output.classification.iab_content.tier3.label": "Laptops" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4057,25 +4075,9 @@ "model_output.classification.iab_content.tier3.label": "Laptops" }, "id": "laptops-easy", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Computing", - "path": "model_output.classification.iab_content.tier2.label" - }, - { - "actual": null, - "expected": "Laptops", - "path": "model_output.classification.iab_content.tier3.label" - } - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Computing > Laptops.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Which laptop should I buy for college?" }, @@ -4101,10 +4103,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier3.label": "Laptops" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4113,26 +4115,15 @@ "model_output.classification.iab_content.tier3.label": "Laptops" }, "id": "laptops-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Laptops", - "path": "model_output.classification.iab_content.tier3.label" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Laptops.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Need a portable computer with good battery life for everyday work" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Software and Applications" @@ -4145,11 +4136,6 @@ }, "id": "desktops-easy", "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, { "actual": "Software and Applications", "expected": "Desktops", @@ -4163,10 +4149,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier3.label": "Desktops" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4175,20 +4161,9 @@ "model_output.classification.iab_content.tier3.label": "Desktops" }, "id": "desktops-medium", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Desktops", - "path": "model_output.classification.iab_content.tier3.label" - } - ], + "mismatches": [], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Desktops.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Which desktop computer should I buy for a home office?" }, @@ -4217,7 +4192,7 @@ "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Consumer Electronics", - "model_output.classification.iab_content.tier3.label": "Wearable Technology" + "model_output.classification.iab_content.tier3.label": "Smartphones" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4226,15 +4201,9 @@ "model_output.classification.iab_content.tier3.label": "Smartphones" }, "id": "smartphones-easy", - "mismatches": [ - { - "actual": "Wearable Technology", - "expected": "Smartphones", - "path": "model_output.classification.iab_content.tier3.label" - } - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Consumer Electronics > Smartphones.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Best phone with a good camera under 700" }, @@ -4243,7 +4212,7 @@ "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Consumer Electronics", - "model_output.classification.iab_content.tier3.label": "Wearable Technology" + "model_output.classification.iab_content.tier3.label": "Smartphones" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4252,15 +4221,9 @@ "model_output.classification.iab_content.tier3.label": "Smartphones" }, "id": "smartphones-medium", - "mismatches": [ - { - "actual": "Wearable Technology", - "expected": "Smartphones", - "path": "model_output.classification.iab_content.tier3.label" - } - ], + "mismatches": [], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Consumer Electronics > Smartphones.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Should I buy an iPhone or Pixel this year?" }, @@ -4269,7 +4232,7 @@ "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Consumer Electronics", - "model_output.classification.iab_content.tier3.label": "Wearable Technology" + "model_output.classification.iab_content.tier3.label": "Smartphones" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4278,15 +4241,9 @@ "model_output.classification.iab_content.tier3.label": "Smartphones" }, "id": "smartphones-hard", - "mismatches": [ - { - "actual": "Wearable Technology", - "expected": "Smartphones", - "path": "model_output.classification.iab_content.tier3.label" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Consumer Electronics > Smartphones.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Need a new smartphone with strong battery life and a clean software experience" }, @@ -4358,9 +4315,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Sports", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier2.label": "Bodybuilding", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -4377,12 +4334,7 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, + "actual": "Bodybuilding", "expected": "Women's Fashion", "path": "model_output.classification.iab_content.tier2.label" }, @@ -4455,9 +4407,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier2.label": "Children's Clothing", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -4469,12 +4421,7 @@ "id": "mens-shoes-easy", "mismatches": [ { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, + "actual": "Children's Clothing", "expected": "Men's Fashion", "path": "model_output.classification.iab_content.tier2.label" }, @@ -4511,9 +4458,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier2.label": "Children's Clothing", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -4525,12 +4472,7 @@ "id": "mens-shoes-hard", "mismatches": [ { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, + "actual": "Children's Clothing", "expected": "Men's Fashion", "path": "model_output.classification.iab_content.tier2.label" }, @@ -4589,7 +4531,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Travel", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier2.label": "Travel Type", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -4600,11 +4542,6 @@ }, "id": "hotels-hard", "mismatches": [ - { - "actual": null, - "expected": "Travel Type", - "path": "model_output.classification.iab_content.tier2.label" - }, { "actual": null, "expected": "Hotels and Motels", @@ -4689,10 +4626,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Healthy Living", - "model_output.classification.iab_content.tier2.label": null, - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier2.label": "Fitness and Exercise", + "model_output.classification.iab_content.tier3.label": "Running and Jogging" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4701,25 +4638,9 @@ "model_output.classification.iab_content.tier3.label": "Running and Jogging" }, "id": "running-and-jogging-easy", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Fitness and Exercise", - "path": "model_output.classification.iab_content.tier2.label" - }, - { - "actual": null, - "expected": "Running and Jogging", - "path": "model_output.classification.iab_content.tier3.label" - } - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Healthy Living > Fitness and Exercise > Running and Jogging.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Best running plan for a first 10k" }, @@ -4762,8 +4683,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Healthy Living", - "model_output.classification.iab_content.tier2.label": "Fitness and Exercise", + "model_output.classification.iab_content.tier1.label": "Sports", + "model_output.classification.iab_content.tier2.label": "Walking", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -4774,6 +4695,16 @@ }, "id": "running-and-jogging-hard", "mismatches": [ + { + "actual": "Sports", + "expected": "Healthy Living", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "Walking", + "expected": "Fitness and Exercise", + "path": "model_output.classification.iab_content.tier2.label" + }, { "actual": null, "expected": "Running and Jogging", @@ -4871,8 +4802,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Books and Literature", - "model_output.classification.iab_content.tier2.label": "Fiction" + "model_output.classification.iab_content.tier1.label": "Travel", + "model_output.classification.iab_content.tier2.label": "Travel Type" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -4881,10 +4812,20 @@ }, "id": "fiction-medium", "mismatches": [ + { + "actual": "Travel", + "expected": "Books and Literature", + "path": "model_output.classification.iab_content.tier1.label" + }, { "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Travel Type", + "expected": "Fiction", + "path": "model_output.classification.iab_content.tier2.label" } ], "notes": "Cross-vertical medium IAB mapping case for Books and Literature > Fiction.", @@ -4914,7 +4855,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Home & Garden", - "model_output.classification.iab_content.tier2.label": "Remodeling & Construction" + "model_output.classification.iab_content.tier2.label": "Interior Decorating" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4924,7 +4865,7 @@ "id": "home-improvement-easy", "mismatches": [ { - "actual": "Remodeling & Construction", + "actual": "Interior Decorating", "expected": "Home Improvement", "path": "model_output.classification.iab_content.tier2.label" } @@ -4936,9 +4877,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Home & Garden", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.tier2.label": "Interior Decorating" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4948,12 +4889,7 @@ "id": "home-improvement-medium", "mismatches": [ { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, + "actual": "Interior Decorating", "expected": "Home Improvement", "path": "model_output.classification.iab_content.tier2.label" } @@ -4994,9 +4930,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.tier2.label": "Augmented Reality" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -5011,12 +4947,7 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, + "actual": "Augmented Reality", "expected": "Online Education", "path": "model_output.classification.iab_content.tier2.label" } @@ -5057,9 +4988,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Healthy Living", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": "Vocational Training" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -5069,17 +5000,12 @@ "id": "online-education-hard", "mismatches": [ { - "actual": "Healthy Living", + "actual": "Careers", "expected": "Education", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, + "actual": "Vocational Training", "expected": "Online Education", "path": "model_output.classification.iab_content.tier2.label" } @@ -5151,7 +5077,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Medical Health" }, "expected": { @@ -5159,15 +5085,9 @@ "model_output.classification.iab_content.tier1.label": "Medical Health" }, "id": "medical-health-easy", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Medical Health.", - "pass": false, + "pass": true, "status": "must_fix", "text": "what do these allergy symptoms mean" }, @@ -5229,7 +5149,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Business and Finance", "model_output.classification.iab_content.tier2.label": "Business" }, @@ -5245,11 +5165,6 @@ "expected": "Careers", "path": "model_output.classification.iab_content.tier1.label" }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, { "actual": "Business", "expected": "Job Search", @@ -5263,9 +5178,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Genres", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.tier2.label": "Talk Show" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -5280,12 +5195,7 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, + "actual": "Talk Show", "expected": "Job Search", "path": "model_output.classification.iab_content.tier2.label" } @@ -5297,9 +5207,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Personal Celebrations & Life Events", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Holidays", + "model_output.classification.iab_content.tier2.label": "National & Civic Holidays" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -5309,17 +5219,12 @@ "id": "personal-finance-easy", "mismatches": [ { - "actual": "Personal Celebrations & Life Events", + "actual": "Holidays", "expected": "Personal Finance", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, + "actual": "National & Civic Holidays", "expected": "Financial Planning", "path": "model_output.classification.iab_content.tier2.label" } @@ -5398,8 +5303,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Hobbies & Interests", - "model_output.classification.iab_content.tier2.label": "Content Production" + "model_output.classification.iab_content.tier1.label": "Genres", + "model_output.classification.iab_content.tier2.label": "Family/Children" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -5409,12 +5314,12 @@ "id": "parenting-medium", "mismatches": [ { - "actual": "Hobbies & Interests", + "actual": "Genres", "expected": "Family and Relationships", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Content Production", + "actual": "Family/Children", "expected": "Parenting", "path": "model_output.classification.iab_content.tier2.label" } @@ -5462,9 +5367,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Food & Drink", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Home & Garden", + "model_output.classification.iab_content.tier2.label": "Gardening" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -5472,25 +5377,9 @@ "model_output.classification.iab_content.tier2.label": "Gardening" }, "id": "gardening-medium", - "mismatches": [ - { - "actual": "Food & Drink", - "expected": "Home & Garden", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Gardening", - "path": "model_output.classification.iab_content.tier2.label" - } - ], + "mismatches": [], "notes": "Cross-vertical medium IAB mapping case for Home & Garden > Gardening.", - "pass": false, + "pass": true, "status": "must_fix", "text": "how often should i water tomato plants" }, @@ -5533,8 +5422,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Genres", - "model_output.classification.iab_content.tier2.label": "Horror" + "model_output.classification.iab_content.tier1.label": "Entertainment", + "model_output.classification.iab_content.tier2.label": "Movies" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -5542,20 +5431,9 @@ "model_output.classification.iab_content.tier2.label": "Movies" }, "id": "movies-medium", - "mismatches": [ - { - "actual": "Genres", - "expected": "Entertainment", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": "Horror", - "expected": "Movies", - "path": "model_output.classification.iab_content.tier2.label" - } - ], + "mismatches": [], "notes": "Cross-vertical medium IAB mapping case for Entertainment > Movies.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Best thriller movies from the last few years" }, @@ -5594,7 +5472,7 @@ "results": [ { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Automotive", "model_output.classification.iab_content.tier2.label": null }, @@ -5609,11 +5487,6 @@ "actual": null, "expected": "Auto Buying and Selling", "path": "model_output.classification.iab_content.tier2.label" - }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Vehicle shopping queries should map into the automotive buying branch, not business sales.", @@ -5643,7 +5516,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null @@ -5665,11 +5538,6 @@ "actual": null, "expected": "Laptops", "path": "model_output.classification.iab_content.tier3.label" - }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Common typo handling should still land in the laptops branch.", @@ -5679,10 +5547,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier3.label": "Software and Applications" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -5703,9 +5571,14 @@ "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": null, + "actual": "Software and Applications", "expected": "Sales", "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "CRM education should resolve to the closest business/sales path, not generic software.", @@ -5715,9 +5588,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier2.label": "Robotics", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -5734,7 +5607,7 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": null, + "actual": "Robotics", "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, @@ -5742,11 +5615,6 @@ "actual": null, "expected": "Sales", "path": "model_output.classification.iab_content.tier3.label" - }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Direct CRM vendor comparison should map cleanly into the sales domain.", @@ -5756,9 +5624,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Careers", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier2.label": "Job Search", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -5775,7 +5643,7 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": null, + "actual": "Job Search", "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, @@ -5783,11 +5651,6 @@ "actual": null, "expected": "Marketing and Advertising", "path": "model_output.classification.iab_content.tier3.label" - }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Marketing tool discovery should map to the marketing and advertising branch.", @@ -5798,7 +5661,7 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Sports", + "model_output.classification.iab_content.tier1.label": "Science", "model_output.classification.iab_content.tier2.label": null }, "expected": { @@ -5809,7 +5672,7 @@ "id": "ml-explanation-maps-to-ai", "mismatches": [ { - "actual": "Sports", + "actual": "Science", "expected": "Technology & Computing", "path": "model_output.classification.iab_content.tier1.label" }, @@ -5831,10 +5694,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier3.label": "Information and Network Security" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -5855,9 +5718,14 @@ "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": null, + "actual": "Information and Network Security", "expected": "Business I.T.", "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Credential and account help should map to business IT rather than generic business.", @@ -5885,9 +5753,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Sports", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Sensitive Topics", + "model_output.classification.iab_content.tier2.label": "Crime & Harmful Acts to Individuals, Society & Human Right Violations", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -5899,12 +5767,12 @@ "id": "trial-signup-maps-to-software", "mismatches": [ { - "actual": "Sports", + "actual": "Sensitive Topics", "expected": "Technology & Computing", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": null, + "actual": "Crime & Harmful Acts to Individuals, Society & Human Right Violations", "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -5912,6 +5780,11 @@ "actual": null, "expected": "Software and Applications", "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Software action queries should map to the software/application branch.", @@ -6388,9 +6261,9 @@ "heads": { "decision_phase": { "difficulty_benchmark": { - "accepted_accuracy": 0.981, + "accepted_accuracy": 0.9524, "accepted_coverage": 1.0, - "accuracy": 0.981, + "accuracy": 0.9524, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_difficulty_benchmark_confusion_matrix.csv", "count": 105, "dataset_path": "/content/agentic-intent-classifier/data/decision_phase_benchmark.jsonl", @@ -6404,12 +6277,12 @@ "macro_f1": 0.9711 }, "hard": { - "accepted_accuracy": 0.9714, + "accepted_accuracy": 0.8857, "accepted_coverage": 1.0, - "accuracy": 0.9714, + "accuracy": 0.8857, "count": 35, "fallback_rate": 0.0, - "macro_f1": 0.9711 + "macro_f1": 0.8908 }, "medium": { "accepted_accuracy": 1.0, @@ -6422,13 +6295,13 @@ }, "fallback_rate": 0.0, "head": "decision_phase", - "macro_f1": 0.9812, + "macro_f1": 0.9536, "per_class_metrics": { - "accuracy": 0.9809523809523809, + "accuracy": 0.9523809523809523, "action": { - "f1-score": 1.0, + "f1-score": 0.9655172413793104, "precision": 1.0, - "recall": 1.0, + "recall": 0.9333333333333333, "support": 15.0 }, "awareness": { @@ -6438,21 +6311,21 @@ "support": 15.0 }, "consideration": { - "f1-score": 0.9655172413793104, + "f1-score": 0.9285714285714286, "precision": 1.0, - "recall": 0.9333333333333333, + "recall": 0.8666666666666667, "support": 15.0 }, "decision": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.9333333333333333, + "precision": 0.9333333333333333, + "recall": 0.9333333333333333, "support": 15.0 }, "macro avg": { - "f1-score": 0.9812192118226601, - "precision": 0.9831932773109244, - "recall": 0.980952380952381, + "f1-score": 0.9536131694056934, + "precision": 0.9604010025062657, + "recall": 0.9523809523809524, "support": 105.0 }, "post_purchase": { @@ -6462,8 +6335,8 @@ "support": 15.0 }, "research": { - "f1-score": 0.9375, - "precision": 0.8823529411764706, + "f1-score": 0.8823529411764706, + "precision": 0.7894736842105263, "recall": 1.0, "support": 15.0 }, @@ -6474,9 +6347,9 @@ "support": 15.0 }, "weighted avg": { - "f1-score": 0.9812192118226601, - "precision": 0.9831932773109243, - "recall": 0.9809523809523809, + "f1-score": 0.9536131694056934, + "precision": 0.9604010025062656, + "recall": 0.9523809523809523, "support": 105.0 } }, @@ -6491,7 +6364,7 @@ "dataset_path": "/content/agentic-intent-classifier/data/decision_phase/final_wave_cases.jsonl", "fallback_rate": 0.0, "head": "decision_phase", - "macro_f1": 0.8823, + "macro_f1": 0.8876, "per_class_metrics": { "accuracy": 0.8888888888888888, "action": { @@ -6501,15 +6374,15 @@ "support": 0.0 }, "awareness": { - "f1-score": 0.8333333333333334, - "precision": 0.7142857142857143, + "f1-score": 0.9090909090909091, + "precision": 0.8333333333333334, "recall": 1.0, "support": 5.0 }, "consideration": { - "f1-score": 0.8888888888888888, + "f1-score": 0.75, "precision": 1.0, - "recall": 0.8, + "recall": 0.6, "support": 5.0 }, "decision": { @@ -6519,9 +6392,9 @@ "support": 5.0 }, "macro avg": { - "f1-score": 0.7562358276643991, - "precision": 0.7687074829931974, - "recall": 0.7571428571428571, + "f1-score": 0.7608225108225108, + "precision": 0.7761904761904762, + "recall": 0.7642857142857142, "support": 27.0 }, "post_purchase": { @@ -6531,9 +6404,9 @@ "support": 4.0 }, "research": { - "f1-score": 0.5714285714285714, - "precision": 0.6666666666666666, - "recall": 0.5, + "f1-score": 0.6666666666666666, + "precision": 0.6, + "recall": 0.75, "support": 4.0 }, "support": { @@ -6543,8 +6416,8 @@ "support": 4.0 }, "weighted avg": { - "f1-score": 0.8850676072898296, - "precision": 0.8977072310405644, + "f1-score": 0.8874859708193041, + "precision": 0.9098765432098765, "recall": 0.8888888888888888, "support": 27.0 } @@ -6621,17 +6494,17 @@ "suite": "hard_cases" }, "test": { - "accepted_accuracy": 0.7931, + "accepted_accuracy": 0.7586, "accepted_coverage": 1.0, - "accuracy": 0.7931, + "accuracy": 0.7586, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_test_confusion_matrix.csv", "count": 29, "dataset_path": "/content/agentic-intent-classifier/data/decision_phase/test.jsonl", "fallback_rate": 0.0, "head": "decision_phase", - "macro_f1": 0.801, + "macro_f1": 0.7724, "per_class_metrics": { - "accuracy": 0.7931034482758621, + "accuracy": 0.7586206896551724, "action": { "f1-score": 1.0, "precision": 1.0, @@ -6651,20 +6524,20 @@ "support": 5.0 }, "decision": { - "f1-score": 1.0, + "f1-score": 0.8888888888888888, "precision": 1.0, - "recall": 1.0, + "recall": 0.8, "support": 5.0 }, "macro avg": { - "f1-score": 0.8010204081632653, - "precision": 0.8285714285714285, - "recall": 0.8214285714285714, + "f1-score": 0.7724489795918367, + "precision": 0.8095238095238095, + "recall": 0.7928571428571428, "support": 29.0 }, "post_purchase": { - "f1-score": 0.8888888888888888, - "precision": 0.8, + "f1-score": 0.8, + "precision": 0.6666666666666666, "recall": 1.0, "support": 4.0 }, @@ -6681,26 +6554,26 @@ "support": 4.0 }, "weighted avg": { - "f1-score": 0.7915982484948002, - "precision": 0.8344827586206897, - "recall": 0.7931034482758621, + "f1-score": 0.7601806239737274, + "precision": 0.8160919540229885, + "recall": 0.7586206896551724, "support": 29.0 } }, "suite": "test" }, "train": { - "accepted_accuracy": 0.9314, + "accepted_accuracy": 0.9412, "accepted_coverage": 1.0, - "accuracy": 0.9314, + "accuracy": 0.9412, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_train_confusion_matrix.csv", "count": 102, "dataset_path": "/content/agentic-intent-classifier/data/decision_phase/train.jsonl", "fallback_rate": 0.0, "head": "decision_phase", - "macro_f1": 0.9373, + "macro_f1": 0.9464, "per_class_metrics": { - "accuracy": 0.9313725490196079, + "accuracy": 0.9411764705882353, "action": { "f1-score": 1.0, "precision": 1.0, @@ -6714,9 +6587,9 @@ "support": 16.0 }, "consideration": { - "f1-score": 0.8484848484848485, - "precision": 0.875, - "recall": 0.8235294117647058, + "f1-score": 0.8387096774193549, + "precision": 0.9285714285714286, + "recall": 0.7647058823529411, "support": 17.0 }, "decision": { @@ -6726,21 +6599,21 @@ "support": 16.0 }, "macro avg": { - "f1-score": 0.9373409595183789, - "precision": 0.9401260504201681, - "recall": 0.9366096438575431, + "f1-score": 0.9463762044407206, + "precision": 0.9496465252767774, + "recall": 0.9479341736694679, "support": 102.0 }, "post_purchase": { - "f1-score": 0.9629629629629629, + "f1-score": 1.0, "precision": 1.0, - "recall": 0.9285714285714286, + "recall": 1.0, "support": 14.0 }, "research": { - "f1-score": 0.8125, - "precision": 0.7647058823529411, - "recall": 0.8666666666666667, + "f1-score": 0.8484848484848485, + "precision": 0.7777777777777778, + "recall": 0.9333333333333333, "support": 15.0 }, "support": { @@ -6750,9 +6623,9 @@ "support": 14.0 }, "weighted avg": { - "f1-score": 0.9322769253786015, - "precision": 0.9353373702422145, - "recall": 0.9313725490196079, + "f1-score": 0.9410231345715216, + "precision": 0.946188279233262, + "recall": 0.9411764705882353, "support": 102.0 } }, @@ -6767,18 +6640,18 @@ "dataset_path": "/content/agentic-intent-classifier/data/decision_phase/val.jsonl", "fallback_rate": 0.0, "head": "decision_phase", - "macro_f1": 0.8612, + "macro_f1": 0.8567, "per_class_metrics": { "accuracy": 0.8620689655172413, "action": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.8571428571428571, + "precision": 0.75, "recall": 1.0, "support": 3.0 }, "awareness": { - "f1-score": 0.8333333333333334, - "precision": 0.7142857142857143, + "f1-score": 0.9090909090909091, + "precision": 0.8333333333333334, "recall": 1.0, "support": 5.0 }, @@ -6795,8 +6668,8 @@ "support": 4.0 }, "macro avg": { - "f1-score": 0.8611626468769326, - "precision": 0.8877551020408163, + "f1-score": 0.8566790352504637, + "precision": 0.880952380952381, "recall": 0.8571428571428571, "support": 29.0 }, @@ -6807,20 +6680,20 @@ "support": 4.0 }, "research": { - "f1-score": 0.5714285714285714, - "precision": 0.6666666666666666, - "recall": 0.5, + "f1-score": 0.75, + "precision": 0.75, + "recall": 0.75, "support": 4.0 }, "support": { - "f1-score": 1.0, + "f1-score": 0.8571428571428571, "precision": 1.0, - "recall": 1.0, + "recall": 0.75, "support": 4.0 }, "weighted avg": { - "f1-score": 0.8570682191371846, - "precision": 0.8760262725779967, + "f1-score": 0.8602776533811015, + "precision": 0.8821839080459771, "recall": 0.8620689655172413, "support": 29.0 } @@ -6830,92 +6703,92 @@ }, "iab_content": { "cross_vertical_benchmark": { - "accepted_accuracy": 0.3, - "accepted_coverage": 0.8889, - "accuracy": 0.2667, + "accepted_accuracy": 0.427, + "accepted_coverage": 0.9889, + "accuracy": 0.4222, "count": 90, "dataset_path": "/content/agentic-intent-classifier/data/iab_cross_vertical_benchmark.jsonl", "difficulty_breakdown": { "easy": { - "accepted_accuracy": 0.28, - "accepted_coverage": 0.8333, - "accuracy": 0.2333, + "accepted_accuracy": 0.4138, + "accepted_coverage": 0.9667, + "accuracy": 0.4, "count": 30, - "fallback_rate": 0.1667, - "macro_f1": 0.1556 + "fallback_rate": 0.0333, + "macro_f1": 0.2727 }, "hard": { - "accepted_accuracy": 0.3846, - "accepted_coverage": 0.8667, - "accuracy": 0.3333, + "accepted_accuracy": 0.4667, + "accepted_coverage": 1.0, + "accuracy": 0.4667, "count": 30, - "fallback_rate": 0.1333, - "macro_f1": 0.2083 + "fallback_rate": 0.0, + "macro_f1": 0.3106 }, "medium": { - "accepted_accuracy": 0.2414, - "accepted_coverage": 0.9667, - "accuracy": 0.2333, + "accepted_accuracy": 0.4, + "accepted_coverage": 1.0, + "accuracy": 0.4, "count": 30, - "fallback_rate": 0.0333, - "macro_f1": 0.1458 + "fallback_rate": 0.0, + "macro_f1": 0.2667 } }, - "fallback_rate": 0.1111, + "fallback_rate": 0.0111, "head": "iab_content", - "macro_f1": 0.1418, + "macro_f1": 0.227, "primary_source": "supervised_classifier", "suite": "cross_vertical_benchmark", "tier_metrics": { - "average_prediction_depth": 1.9556, + "average_prediction_depth": 2.4, "error_buckets": { - "exact_match": 24, - "parent_safe_stop": 10, - "right_tier1_wrong_tier2": 15, - "wrong_deep_leaf": 12, + "exact_match": 38, + "parent_safe_stop": 1, + "right_tier1_wrong_tier2": 14, + "wrong_deep_leaf": 8, "wrong_tier1": 29 }, - "exact_path_accuracy": 0.2667, - "parent_safe_accuracy": 0.4556, + "exact_path_accuracy": 0.4222, + "parent_safe_accuracy": 0.4444, "tier1_accuracy": 0.6778, - "tier2_accuracy": 0.4762, - "tier3_accuracy": 0.2143, - "tier4_accuracy": 0.0 + "tier2_accuracy": 0.4881, + "tier3_accuracy": 0.5238, + "tier4_accuracy": 0.5 }, "view_metrics": { "classifier": { - "average_prediction_depth": 1.9556, + "average_prediction_depth": 2.4, "error_buckets": { - "exact_match": 24, - "parent_safe_stop": 10, - "right_tier1_wrong_tier2": 15, - "wrong_deep_leaf": 12, + "exact_match": 37, + "parent_safe_stop": 1, + "right_tier1_wrong_tier2": 14, + "wrong_deep_leaf": 9, "wrong_tier1": 29 }, - "exact_path_accuracy": 0.2667, - "parent_safe_accuracy": 0.4556, + "exact_path_accuracy": 0.4111, + "parent_safe_accuracy": 0.4333, "tier1_accuracy": 0.6778, - "tier2_accuracy": 0.4762, - "tier3_accuracy": 0.2143, - "tier4_accuracy": 0.0 + "tier2_accuracy": 0.4881, + "tier3_accuracy": 0.4762, + "tier4_accuracy": 0.5 }, "combined_path": { - "average_prediction_depth": 1.9556, + "average_prediction_depth": 2.4, "error_buckets": { - "exact_match": 24, - "parent_safe_stop": 10, - "right_tier1_wrong_tier2": 15, - "wrong_deep_leaf": 12, + "exact_match": 37, + "parent_safe_stop": 1, + "right_tier1_wrong_tier2": 14, + "wrong_deep_leaf": 9, "wrong_tier1": 29 }, - "exact_path_accuracy": 0.2667, - "fallback_overuse_count": 17, - "fallback_rate": 0.1889, - "parent_safe_accuracy": 0.4556, + "exact_path_accuracy": 0.4111, + "fallback_overuse_count": 25, + "fallback_rate": 0.2778, + "parent_safe_accuracy": 0.4333, "tier1_accuracy": 0.6778, - "tier2_accuracy": 0.4762, - "tier3_accuracy": 0.2143, - "tier4_accuracy": 0.0 + "tier2_accuracy": 0.4881, + "tier3_accuracy": 0.4762, + "tier4_accuracy": 0.5 }, "disagreements": { "classifier_vs_combined": 0 @@ -6928,92 +6801,92 @@ } }, "difficulty_benchmark": { - "accepted_accuracy": 0.4219, - "accepted_coverage": 0.8205, - "accuracy": 0.3462, + "accepted_accuracy": 0.4231, + "accepted_coverage": 1.0, + "accuracy": 0.4231, "count": 156, "dataset_path": "/content/agentic-intent-classifier/data/iab_benchmark.jsonl", "difficulty_breakdown": { "easy": { - "accepted_accuracy": 0.4889, - "accepted_coverage": 0.8654, - "accuracy": 0.4231, + "accepted_accuracy": 0.4615, + "accepted_coverage": 1.0, + "accuracy": 0.4615, "count": 52, - "fallback_rate": 0.1346, - "macro_f1": 0.2305 + "fallback_rate": 0.0, + "macro_f1": 0.2359 }, "hard": { - "accepted_accuracy": 0.3846, - "accepted_coverage": 0.75, - "accuracy": 0.2885, + "accepted_accuracy": 0.3654, + "accepted_coverage": 1.0, + "accuracy": 0.3654, "count": 52, - "fallback_rate": 0.25, - "macro_f1": 0.1638 + "fallback_rate": 0.0, + "macro_f1": 0.1892 }, "medium": { - "accepted_accuracy": 0.3864, - "accepted_coverage": 0.8462, - "accuracy": 0.3269, + "accepted_accuracy": 0.4423, + "accepted_coverage": 1.0, + "accuracy": 0.4423, "count": 52, - "fallback_rate": 0.1538, - "macro_f1": 0.1819 + "fallback_rate": 0.0, + "macro_f1": 0.2338 } }, - "fallback_rate": 0.1795, + "fallback_rate": 0.0, "head": "iab_content", - "macro_f1": 0.1478, + "macro_f1": 0.1524, "primary_source": "supervised_classifier", "suite": "difficulty_benchmark", "tier_metrics": { - "average_prediction_depth": 2.0256, + "average_prediction_depth": 2.4103, "error_buckets": { - "exact_match": 54, - "parent_safe_stop": 21, - "right_tier1_wrong_tier2": 37, - "wrong_deep_leaf": 3, - "wrong_tier1": 41 - }, - "exact_path_accuracy": 0.3462, - "parent_safe_accuracy": 0.6603, - "tier1_accuracy": 0.7372, - "tier2_accuracy": 0.5, - "tier3_accuracy": 0.3519, - "tier4_accuracy": 0.2917 + "exact_match": 66, + "parent_safe_stop": 1, + "right_tier1_wrong_tier2": 42, + "wrong_deep_leaf": 8, + "wrong_tier1": 39 + }, + "exact_path_accuracy": 0.4231, + "parent_safe_accuracy": 0.5385, + "tier1_accuracy": 0.75, + "tier2_accuracy": 0.4808, + "tier3_accuracy": 0.5093, + "tier4_accuracy": 0.4583 }, "view_metrics": { "classifier": { - "average_prediction_depth": 2.0256, + "average_prediction_depth": 2.4103, "error_buckets": { - "exact_match": 49, - "parent_safe_stop": 21, - "right_tier1_wrong_tier2": 37, - "wrong_deep_leaf": 8, - "wrong_tier1": 41 - }, - "exact_path_accuracy": 0.3141, - "parent_safe_accuracy": 0.6282, - "tier1_accuracy": 0.7372, - "tier2_accuracy": 0.5, - "tier3_accuracy": 0.3056, - "tier4_accuracy": 0.2083 + "exact_match": 59, + "parent_safe_stop": 1, + "right_tier1_wrong_tier2": 42, + "wrong_deep_leaf": 15, + "wrong_tier1": 39 + }, + "exact_path_accuracy": 0.3782, + "parent_safe_accuracy": 0.4936, + "tier1_accuracy": 0.75, + "tier2_accuracy": 0.4808, + "tier3_accuracy": 0.4259, + "tier4_accuracy": 0.1667 }, "combined_path": { - "average_prediction_depth": 2.0256, + "average_prediction_depth": 2.4103, "error_buckets": { - "exact_match": 49, - "parent_safe_stop": 21, - "right_tier1_wrong_tier2": 37, - "wrong_deep_leaf": 8, - "wrong_tier1": 41 - }, - "exact_path_accuracy": 0.3141, - "fallback_overuse_count": 14, - "fallback_rate": 0.0897, - "parent_safe_accuracy": 0.6282, - "tier1_accuracy": 0.7372, - "tier2_accuracy": 0.5, - "tier3_accuracy": 0.3056, - "tier4_accuracy": 0.2083 + "exact_match": 59, + "parent_safe_stop": 1, + "right_tier1_wrong_tier2": 42, + "wrong_deep_leaf": 15, + "wrong_tier1": 39 + }, + "exact_path_accuracy": 0.3782, + "fallback_overuse_count": 15, + "fallback_rate": 0.0962, + "parent_safe_accuracy": 0.4936, + "tier1_accuracy": 0.75, + "tier2_accuracy": 0.4808, + "tier3_accuracy": 0.4259, + "tier4_accuracy": 0.1667 }, "disagreements": { "classifier_vs_combined": 0 @@ -7026,26 +6899,26 @@ } }, "extended_cases": { - "accepted_accuracy": 0.4286, - "accepted_coverage": 0.875, - "accuracy": 0.375, + "accepted_accuracy": 0.5, + "accepted_coverage": 1.0, + "accuracy": 0.5, "count": 8, "dataset_path": "/content/agentic-intent-classifier/data/iab/extended_cases.jsonl", - "fallback_rate": 0.125, + "fallback_rate": 0.0, "head": "iab_content", - "macro_f1": 0.2308, + "macro_f1": 0.3333, "primary_source": "supervised_classifier", "suite": "extended_cases", "tier_metrics": { - "average_prediction_depth": 2.0, + "average_prediction_depth": 2.125, "error_buckets": { - "exact_match": 3, + "exact_match": 4, "right_tier1_wrong_tier2": 2, - "wrong_deep_leaf": 2, + "wrong_deep_leaf": 1, "wrong_tier1": 1 }, - "exact_path_accuracy": 0.375, - "parent_safe_accuracy": 0.375, + "exact_path_accuracy": 0.5, + "parent_safe_accuracy": 0.5, "tier1_accuracy": 0.875, "tier2_accuracy": 0.5714, "tier3_accuracy": 0.0, @@ -7053,32 +6926,32 @@ }, "view_metrics": { "classifier": { - "average_prediction_depth": 2.0, + "average_prediction_depth": 2.125, "error_buckets": { - "exact_match": 3, + "exact_match": 4, "right_tier1_wrong_tier2": 2, - "wrong_deep_leaf": 2, + "wrong_deep_leaf": 1, "wrong_tier1": 1 }, - "exact_path_accuracy": 0.375, - "parent_safe_accuracy": 0.375, + "exact_path_accuracy": 0.5, + "parent_safe_accuracy": 0.5, "tier1_accuracy": 0.875, "tier2_accuracy": 0.5714, "tier3_accuracy": 0.0, "tier4_accuracy": 0.0 }, "combined_path": { - "average_prediction_depth": 2.0, + "average_prediction_depth": 2.125, "error_buckets": { - "exact_match": 3, + "exact_match": 4, "right_tier1_wrong_tier2": 2, - "wrong_deep_leaf": 2, + "wrong_deep_leaf": 1, "wrong_tier1": 1 }, - "exact_path_accuracy": 0.375, + "exact_path_accuracy": 0.5, "fallback_overuse_count": 2, "fallback_rate": 0.25, - "parent_safe_accuracy": 0.375, + "parent_safe_accuracy": 0.5, "tier1_accuracy": 0.875, "tier2_accuracy": 0.5714, "tier3_accuracy": 0.0, @@ -7095,18 +6968,18 @@ } }, "hard_cases": { - "accepted_accuracy": 0.5, - "accepted_coverage": 0.75, + "accepted_accuracy": 0.4286, + "accepted_coverage": 0.875, "accuracy": 0.375, "count": 8, "dataset_path": "/content/agentic-intent-classifier/data/iab/hard_cases.jsonl", - "fallback_rate": 0.25, + "fallback_rate": 0.125, "head": "iab_content", - "macro_f1": 0.25, + "macro_f1": 0.2308, "primary_source": "supervised_classifier", "suite": "hard_cases", "tier_metrics": { - "average_prediction_depth": 2.0, + "average_prediction_depth": 2.25, "error_buckets": { "exact_match": 3, "right_tier1_wrong_tier2": 1, @@ -7121,7 +6994,7 @@ }, "view_metrics": { "classifier": { - "average_prediction_depth": 2.0, + "average_prediction_depth": 2.25, "error_buckets": { "exact_match": 3, "right_tier1_wrong_tier2": 1, @@ -7135,7 +7008,7 @@ "tier4_accuracy": 0.0 }, "combined_path": { - "average_prediction_depth": 2.0, + "average_prediction_depth": 2.25, "error_buckets": { "exact_match": 3, "right_tier1_wrong_tier2": 1, @@ -7161,48 +7034,48 @@ } }, "test": { - "accepted_accuracy": 0.9242, - "accepted_coverage": 0.9973, - "accuracy": 0.922, + "accepted_accuracy": 0.943, + "accepted_coverage": 1.0, + "accuracy": 0.943, "count": 3282, "dataset_path": "/content/agentic-intent-classifier/data/iab/test.jsonl", - "fallback_rate": 0.0027, + "fallback_rate": 0.0, "head": "iab_content", - "macro_f1": 0.8741, + "macro_f1": 0.911, "primary_source": "supervised_classifier", "suite": "test", "tier_metrics": { - "average_prediction_depth": 2.1789, + "average_prediction_depth": 2.213, "error_buckets": { - "exact_match": 3026, - "parent_safe_stop": 68, - "right_tier1_wrong_tier2": 59, - "wrong_deep_leaf": 96, - "wrong_tier1": 33 - }, - "exact_path_accuracy": 0.922, - "parent_safe_accuracy": 0.9509, - "tier1_accuracy": 0.9899, - "tier2_accuracy": 0.9693, - "tier3_accuracy": 0.8477, - "tier4_accuracy": 0.5143 + "exact_match": 3095, + "parent_safe_stop": 45, + "right_tier1_wrong_tier2": 41, + "wrong_deep_leaf": 72, + "wrong_tier1": 29 + }, + "exact_path_accuracy": 0.943, + "parent_safe_accuracy": 0.958, + "tier1_accuracy": 0.9912, + "tier2_accuracy": 0.9776, + "tier3_accuracy": 0.9078, + "tier4_accuracy": 0.7 }, "view_metrics": { "classifier": { - "average_prediction_depth": 2.1789, + "average_prediction_depth": 2.213, "error_buckets": { - "exact_match": 2995, - "parent_safe_stop": 62, - "right_tier1_wrong_tier2": 71, - "wrong_deep_leaf": 121, - "wrong_tier1": 33 - }, - "exact_path_accuracy": 0.9126, - "parent_safe_accuracy": 0.9397, - "tier1_accuracy": 0.9899, - "tier2_accuracy": 0.9651, - "tier3_accuracy": 0.8218, - "tier4_accuracy": 0.3929 + "exact_match": 3052, + "parent_safe_stop": 44, + "right_tier1_wrong_tier2": 53, + "wrong_deep_leaf": 104, + "wrong_tier1": 29 + }, + "exact_path_accuracy": 0.9299, + "parent_safe_accuracy": 0.9445, + "tier1_accuracy": 0.9912, + "tier2_accuracy": 0.9734, + "tier3_accuracy": 0.8725, + "tier4_accuracy": 0.5 }, "combined_path": { "count": 3282, @@ -7224,48 +7097,48 @@ } }, "train": { - "accepted_accuracy": 0.93, - "accepted_coverage": 0.9978, - "accuracy": 0.9282, + "accepted_accuracy": 0.9459, + "accepted_coverage": 1.0, + "accuracy": 0.9459, "count": 13211, "dataset_path": "/content/agentic-intent-classifier/data/iab/train.jsonl", - "fallback_rate": 0.0022, + "fallback_rate": 0.0, "head": "iab_content", - "macro_f1": 0.8851, + "macro_f1": 0.9194, "primary_source": "supervised_classifier", "suite": "train", "tier_metrics": { - "average_prediction_depth": 2.172, + "average_prediction_depth": 2.2105, "error_buckets": { - "exact_match": 12263, - "parent_safe_stop": 259, - "right_tier1_wrong_tier2": 229, - "wrong_deep_leaf": 356, - "wrong_tier1": 104 - }, - "exact_path_accuracy": 0.9282, - "parent_safe_accuracy": 0.9572, - "tier1_accuracy": 0.9921, - "tier2_accuracy": 0.9726, - "tier3_accuracy": 0.8565, - "tier4_accuracy": 0.5518 + "exact_match": 12496, + "parent_safe_stop": 162, + "right_tier1_wrong_tier2": 144, + "wrong_deep_leaf": 284, + "wrong_tier1": 125 + }, + "exact_path_accuracy": 0.9459, + "parent_safe_accuracy": 0.9585, + "tier1_accuracy": 0.9905, + "tier2_accuracy": 0.9805, + "tier3_accuracy": 0.9135, + "tier4_accuracy": 0.7268 }, "view_metrics": { "classifier": { - "average_prediction_depth": 2.172, + "average_prediction_depth": 2.2105, "error_buckets": { - "exact_match": 12130, - "parent_safe_stop": 238, - "right_tier1_wrong_tier2": 277, - "wrong_deep_leaf": 462, - "wrong_tier1": 104 - }, - "exact_path_accuracy": 0.9182, - "parent_safe_accuracy": 0.9456, - "tier1_accuracy": 0.9921, - "tier2_accuracy": 0.9685, - "tier3_accuracy": 0.829, - "tier4_accuracy": 0.4214 + "exact_match": 12323, + "parent_safe_stop": 157, + "right_tier1_wrong_tier2": 192, + "wrong_deep_leaf": 414, + "wrong_tier1": 125 + }, + "exact_path_accuracy": 0.9328, + "parent_safe_accuracy": 0.945, + "tier1_accuracy": 0.9905, + "tier2_accuracy": 0.9764, + "tier3_accuracy": 0.8777, + "tier4_accuracy": 0.525 }, "combined_path": { "count": 13211, @@ -7287,48 +7160,48 @@ } }, "val": { - "accepted_accuracy": 0.9246, - "accepted_coverage": 0.9979, - "accuracy": 0.9229, + "accepted_accuracy": 0.9442, + "accepted_coverage": 1.0, + "accuracy": 0.9442, "count": 3282, "dataset_path": "/content/agentic-intent-classifier/data/iab/val.jsonl", - "fallback_rate": 0.0021, + "fallback_rate": 0.0, "head": "iab_content", - "macro_f1": 0.8789, + "macro_f1": 0.9166, "primary_source": "supervised_classifier", "suite": "val", "tier_metrics": { - "average_prediction_depth": 2.1789, + "average_prediction_depth": 2.2151, "error_buckets": { - "exact_match": 3029, - "parent_safe_stop": 69, - "right_tier1_wrong_tier2": 67, - "wrong_deep_leaf": 91, - "wrong_tier1": 26 - }, - "exact_path_accuracy": 0.9229, - "parent_safe_accuracy": 0.9549, - "tier1_accuracy": 0.9921, - "tier2_accuracy": 0.9686, - "tier3_accuracy": 0.8549, - "tier4_accuracy": 0.5286 + "exact_match": 3099, + "parent_safe_stop": 35, + "right_tier1_wrong_tier2": 45, + "wrong_deep_leaf": 72, + "wrong_tier1": 31 + }, + "exact_path_accuracy": 0.9442, + "parent_safe_accuracy": 0.9576, + "tier1_accuracy": 0.9906, + "tier2_accuracy": 0.9769, + "tier3_accuracy": 0.9088, + "tier4_accuracy": 0.7286 }, "view_metrics": { "classifier": { - "average_prediction_depth": 2.1789, + "average_prediction_depth": 2.2151, "error_buckets": { - "exact_match": 2997, - "parent_safe_stop": 64, - "right_tier1_wrong_tier2": 79, - "wrong_deep_leaf": 116, - "wrong_tier1": 26 - }, - "exact_path_accuracy": 0.9132, - "parent_safe_accuracy": 0.9436, - "tier1_accuracy": 0.9921, - "tier2_accuracy": 0.9644, - "tier3_accuracy": 0.829, - "tier4_accuracy": 0.4071 + "exact_match": 3056, + "parent_safe_stop": 34, + "right_tier1_wrong_tier2": 57, + "wrong_deep_leaf": 104, + "wrong_tier1": 31 + }, + "exact_path_accuracy": 0.9311, + "parent_safe_accuracy": 0.9442, + "tier1_accuracy": 0.9906, + "tier2_accuracy": 0.9727, + "tier3_accuracy": 0.8736, + "tier4_accuracy": 0.5286 }, "combined_path": { "count": 3282, @@ -7352,9 +7225,9 @@ }, "intent_subtype": { "difficulty_benchmark": { - "accepted_accuracy": 0.8967, - "accepted_coverage": 0.9783, - "accuracy": 0.8773, + "accepted_accuracy": 0.9104, + "accepted_coverage": 0.9675, + "accuracy": 0.8917, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_difficulty_benchmark_confusion_matrix.csv", "count": 277, "dataset_path": "/content/agentic-intent-classifier/data/subtype_benchmark.jsonl", @@ -7365,15 +7238,15 @@ "accuracy": 0.913, "count": 92, "fallback_rate": 0.0, - "macro_f1": 0.9111 + "macro_f1": 0.9109 }, "hard": { - "accepted_accuracy": 0.814, - "accepted_coverage": 0.9451, - "accuracy": 0.7692, + "accepted_accuracy": 0.8554, + "accepted_coverage": 0.9121, + "accuracy": 0.8132, "count": 91, - "fallback_rate": 0.0549, - "macro_f1": 0.7704 + "fallback_rate": 0.0879, + "macro_f1": 0.8025 }, "medium": { "accepted_accuracy": 0.957, @@ -7381,30 +7254,30 @@ "accuracy": 0.9468, "count": 94, "fallback_rate": 0.0106, - "macro_f1": 0.9453 + "macro_f1": 0.9469 } }, - "fallback_rate": 0.0217, + "fallback_rate": 0.0325, "head": "intent_subtype", - "macro_f1": 0.8767, + "macro_f1": 0.8886, "per_class_metrics": { "account_help": { - "f1-score": 0.7741935483870968, - "precision": 0.75, - "recall": 0.8, + "f1-score": 0.64, + "precision": 0.8, + "recall": 0.5333333333333333, "support": 15.0 }, - "accuracy": 0.8772563176895307, + "accuracy": 0.8916967509025271, "billing_help": { - "f1-score": 0.8, - "precision": 0.8, - "recall": 0.8, + "f1-score": 0.8387096774193549, + "precision": 0.8125, + "recall": 0.8666666666666667, "support": 15.0 }, "booking": { - "f1-score": 0.6956521739130435, + "f1-score": 0.9285714285714286, "precision": 1.0, - "recall": 0.5333333333333333, + "recall": 0.8666666666666667, "support": 15.0 }, "comparison": { @@ -7414,26 +7287,26 @@ "support": 15.0 }, "contact_sales": { - "f1-score": 0.9333333333333333, - "precision": 0.9333333333333333, - "recall": 0.9333333333333333, + "f1-score": 0.9375, + "precision": 0.8823529411764706, + "recall": 1.0, "support": 15.0 }, "deal_seeking": { - "f1-score": 0.9285714285714286, - "precision": 1.0, + "f1-score": 0.896551724137931, + "precision": 0.9285714285714286, "recall": 0.8666666666666667, "support": 15.0 }, "download": { - "f1-score": 0.896551724137931, - "precision": 0.9285714285714286, + "f1-score": 0.9285714285714286, + "precision": 1.0, "recall": 0.8666666666666667, "support": 15.0 }, "education": { - "f1-score": 0.9375, - "precision": 0.8823529411764706, + "f1-score": 0.9090909090909091, + "precision": 0.8333333333333334, "recall": 1.0, "support": 15.0 }, @@ -7444,84 +7317,84 @@ "support": 15.0 }, "evaluation": { - "f1-score": 0.8484848484848485, - "precision": 0.7777777777777778, - "recall": 0.9333333333333333, + "f1-score": 0.896551724137931, + "precision": 0.9285714285714286, + "recall": 0.8666666666666667, "support": 15.0 }, "follow_up": { - "f1-score": 0.9285714285714286, + "f1-score": 0.9655172413793104, "precision": 1.0, - "recall": 0.8666666666666667, + "recall": 0.9333333333333333, "support": 15.0 }, "macro avg": { - "f1-score": 0.8767381318695205, - "precision": 0.893520983060569, - "recall": 0.8760257806826435, + "f1-score": 0.8886471209737711, + "precision": 0.8965122159975102, + "recall": 0.8895561002178651, "support": 277.0 }, "onboarding_setup": { - "f1-score": 0.9090909090909091, - "precision": 0.9375, - "recall": 0.8823529411764706, + "f1-score": 0.8648648648648649, + "precision": 0.8, + "recall": 0.9411764705882353, "support": 17.0 }, "product_discovery": { - "f1-score": 0.896551724137931, - "precision": 0.9285714285714286, - "recall": 0.8666666666666667, + "f1-score": 0.9032258064516129, + "precision": 0.875, + "recall": 0.9333333333333333, "support": 15.0 }, "provider_selection": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.9696969696969697, + "precision": 0.9411764705882353, "recall": 1.0, "support": 16.0 }, "purchase": { - "f1-score": 0.9285714285714286, - "precision": 1.0, + "f1-score": 0.896551724137931, + "precision": 0.9285714285714286, "recall": 0.8666666666666667, "support": 15.0 }, "signup": { - "f1-score": 0.7777777777777778, - "precision": 0.7, - "recall": 0.875, + "f1-score": 0.8823529411764706, + "precision": 0.8333333333333334, + "recall": 0.9375, "support": 16.0 }, "task_execution": { - "f1-score": 0.8292682926829268, - "precision": 0.7391304347826086, - "recall": 0.9444444444444444, + "f1-score": 0.9230769230769231, + "precision": 0.8571428571428571, + "recall": 1.0, "support": 18.0 }, "troubleshooting": { - "f1-score": 0.8823529411764706, - "precision": 0.7894736842105263, - "recall": 1.0, + "f1-score": 0.8, + "precision": 0.8, + "recall": 0.8, "support": 15.0 }, "weighted avg": { - "f1-score": 0.8765453432446891, - "precision": 0.8918521903635431, - "recall": 0.8772563176895307, + "f1-score": 0.8891181699377334, + "precision": 0.8953221541324111, + "recall": 0.8916967509025271, "support": 277.0 } }, "suite": "difficulty_benchmark" }, "extended_cases": { - "accepted_accuracy": 0.8491, + "accepted_accuracy": 0.8302, "accepted_coverage": 1.0, - "accuracy": 0.8491, + "accuracy": 0.8302, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_extended_cases_confusion_matrix.csv", "count": 53, "dataset_path": "/content/agentic-intent-classifier/data/subtype/extended_cases.jsonl", "fallback_rate": 0.0, "head": "intent_subtype", - "macro_f1": 0.7764, + "macro_f1": 0.7668, "per_class_metrics": { "account_help": { "f1-score": 0.8, @@ -7529,7 +7402,7 @@ "recall": 0.6666666666666666, "support": 3.0 }, - "accuracy": 0.8490566037735849, + "accuracy": 0.8301886792452831, "billing_help": { "f1-score": 0.0, "precision": 0.0, @@ -7555,8 +7428,8 @@ "support": 0.0 }, "deal_seeking": { - "f1-score": 0.9, - "precision": 0.8181818181818182, + "f1-score": 0.8571428571428571, + "precision": 0.75, "recall": 1.0, "support": 9.0 }, @@ -7585,15 +7458,15 @@ "support": 3.0 }, "follow_up": { - "f1-score": 0.8, + "f1-score": 0.7368421052631579, "precision": 1.0, - "recall": 0.6666666666666666, + "recall": 0.5833333333333334, "support": 12.0 }, "macro avg": { - "f1-score": 0.474472286972287, - "precision": 0.46035754369087706, - "recall": 0.5185185185185186, + "f1-score": 0.46858256266151, + "precision": 0.4565696649029982, + "recall": 0.513888888888889, "support": 53.0 }, "onboarding_setup": { @@ -7639,9 +7512,9 @@ "support": 1.0 }, "weighted avg": { - "f1-score": 0.823438668249989, - "precision": 0.8324076342944268, - "recall": 0.8490566037735849, + "f1-score": 0.8018611395225099, + "precision": 0.8208295896975142, + "recall": 0.8301886792452831, "support": 53.0 } }, @@ -7656,7 +7529,7 @@ "dataset_path": "/content/agentic-intent-classifier/data/subtype/hard_cases.jsonl", "fallback_rate": 0.0, "head": "intent_subtype", - "macro_f1": 0.846, + "macro_f1": 0.8447, "per_class_metrics": { "account_help": { "f1-score": 0.8, @@ -7702,8 +7575,8 @@ "support": 0.0 }, "education": { - "f1-score": 0.9666666666666667, - "precision": 0.9354838709677419, + "f1-score": 0.9508196721311475, + "precision": 0.90625, "recall": 1.0, "support": 29.0 }, @@ -7714,9 +7587,9 @@ "support": 0.0 }, "evaluation": { - "f1-score": 0.0, - "precision": 0.0, - "recall": 0.0, + "f1-score": 0.25, + "precision": 0.5, + "recall": 0.16666666666666666, "support": 6.0 }, "follow_up": { @@ -7726,9 +7599,9 @@ "support": 12.0 }, "macro avg": { - "f1-score": 0.7049919484702094, - "precision": 0.7038231780167263, - "recall": 0.7305555555555556, + "f1-score": 0.7038911013311109, + "precision": 0.7234953703703704, + "recall": 0.7212962962962962, "support": 94.0 }, "onboarding_setup": { @@ -7738,8 +7611,8 @@ "support": 6.0 }, "product_discovery": { - "f1-score": 0.8, - "precision": 0.6666666666666666, + "f1-score": 0.8888888888888888, + "precision": 0.8, "recall": 1.0, "support": 8.0 }, @@ -7750,14 +7623,14 @@ "support": 10.0 }, "purchase": { - "f1-score": 1.0, + "f1-score": 0.8, "precision": 1.0, - "recall": 1.0, + "recall": 0.6666666666666666, "support": 3.0 }, "signup": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.8571428571428571, + "precision": 0.75, "recall": 1.0, "support": 3.0 }, @@ -7774,8 +7647,8 @@ "support": 3.0 }, "weighted avg": { - "f1-score": 0.8721091581868641, - "precision": 0.8648478609013955, + "f1-score": 0.8798004011763282, + "precision": 0.8911125886524823, "recall": 0.8936170212765957, "support": 94.0 } @@ -7783,15 +7656,15 @@ "suite": "hard_cases" }, "test": { - "accepted_accuracy": 0.9143, + "accepted_accuracy": 0.9, "accepted_coverage": 1.0, - "accuracy": 0.9143, + "accuracy": 0.9, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_test_confusion_matrix.csv", "count": 70, "dataset_path": "/content/agentic-intent-classifier/data/subtype/test.jsonl", "fallback_rate": 0.0, "head": "intent_subtype", - "macro_f1": 0.8855, + "macro_f1": 0.8531, "per_class_metrics": { "account_help": { "f1-score": 1.0, @@ -7799,7 +7672,7 @@ "recall": 1.0, "support": 2.0 }, - "accuracy": 0.9142857142857143, + "accuracy": 0.9, "billing_help": { "f1-score": 0.0, "precision": 0.0, @@ -7825,8 +7698,8 @@ "support": 0.0 }, "deal_seeking": { - "f1-score": 0.8, - "precision": 0.6666666666666666, + "f1-score": 0.6666666666666666, + "precision": 0.5, "recall": 1.0, "support": 2.0 }, @@ -7849,9 +7722,9 @@ "support": 5.0 }, "evaluation": { - "f1-score": 0.5, - "precision": 0.5, - "recall": 0.5, + "f1-score": 0.0, + "precision": 0.0, + "recall": 0.0, "support": 2.0 }, "follow_up": { @@ -7861,14 +7734,14 @@ "support": 11.0 }, "macro avg": { - "f1-score": 0.6887592108100274, - "precision": 0.7, - "recall": 0.6978114478114478, + "f1-score": 0.6635221022395855, + "precision": 0.6578042328042328, + "recall": 0.6885521885521885, "support": 70.0 }, "onboarding_setup": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.8888888888888888, + "precision": 0.8, "recall": 1.0, "support": 4.0 }, @@ -7879,9 +7752,9 @@ "support": 8.0 }, "provider_selection": { - "f1-score": 0.9090909090909091, + "f1-score": 0.8, "precision": 1.0, - "recall": 0.8333333333333334, + "recall": 0.6666666666666666, "support": 6.0 }, "purchase": { @@ -7897,59 +7770,59 @@ "support": 2.0 }, "task_execution": { - "f1-score": 0.8571428571428571, - "precision": 0.75, + "f1-score": 0.9230769230769231, + "precision": 0.8571428571428571, "recall": 1.0, "support": 6.0 }, "troubleshooting": { - "f1-score": 0.6666666666666666, + "f1-score": 1.0, "precision": 1.0, - "recall": 0.5, + "recall": 1.0, "support": 2.0 }, "weighted avg": { - "f1-score": 0.9126080539458813, - "precision": 0.9307142857142858, - "recall": 0.9142857142857143, + "f1-score": 0.8939882610403741, + "precision": 0.9094217687074829, + "recall": 0.9, "support": 70.0 } }, "suite": "test" }, "train": { - "accepted_accuracy": 0.9042, + "accepted_accuracy": 0.8978, "accepted_coverage": 1.0, - "accuracy": 0.9042, + "accuracy": 0.8978, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_train_confusion_matrix.csv", "count": 313, "dataset_path": "/content/agentic-intent-classifier/data/subtype/train.jsonl", "fallback_rate": 0.0, "head": "intent_subtype", - "macro_f1": 0.8789, + "macro_f1": 0.877, "per_class_metrics": { "account_help": { - "f1-score": 0.875, - "precision": 0.7777777777777778, - "recall": 1.0, + "f1-score": 0.7142857142857143, + "precision": 0.7142857142857143, + "recall": 0.7142857142857143, "support": 7.0 }, - "accuracy": 0.9041533546325878, + "accuracy": 0.8977635782747604, "billing_help": { - "f1-score": 0.8, + "f1-score": 1.0, "precision": 1.0, - "recall": 0.6666666666666666, + "recall": 1.0, "support": 6.0 }, "booking": { - "f1-score": 0.6, - "precision": 0.6, - "recall": 0.6, + "f1-score": 0.8333333333333334, + "precision": 0.7142857142857143, + "recall": 1.0, "support": 5.0 }, "comparison": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.967741935483871, + "precision": 0.9375, "recall": 1.0, "support": 15.0 }, @@ -7960,8 +7833,8 @@ "support": 9.0 }, "deal_seeking": { - "f1-score": 0.9523809523809523, - "precision": 1.0, + "f1-score": 0.9090909090909091, + "precision": 0.9090909090909091, "recall": 0.9090909090909091, "support": 11.0 }, @@ -7972,8 +7845,8 @@ "support": 8.0 }, "education": { - "f1-score": 0.9719626168224299, - "precision": 0.9454545454545454, + "f1-score": 0.9629629629629629, + "precision": 0.9285714285714286, "recall": 1.0, "support": 52.0 }, @@ -7984,33 +7857,33 @@ "support": 20.0 }, "evaluation": { - "f1-score": 0.64, + "f1-score": 0.6923076923076923, "precision": 1.0, - "recall": 0.47058823529411764, + "recall": 0.5294117647058824, "support": 17.0 }, "follow_up": { - "f1-score": 0.8611111111111112, - "precision": 0.8611111111111112, - "recall": 0.8611111111111112, + "f1-score": 0.8571428571428571, + "precision": 0.8823529411764706, + "recall": 0.8333333333333334, "support": 36.0 }, "macro avg": { - "f1-score": 0.8788540112011829, - "precision": 0.9092031425364758, - "recall": 0.8729694019289211, + "f1-score": 0.8770498618135788, + "precision": 0.8988923431325393, + "recall": 0.876671278202288, "support": 313.0 }, "onboarding_setup": { - "f1-score": 0.9375, + "f1-score": 0.9696969696969697, "precision": 1.0, - "recall": 0.8823529411764706, + "recall": 0.9411764705882353, "support": 17.0 }, "product_discovery": { - "f1-score": 0.8955223880597015, - "precision": 0.8333333333333334, - "recall": 0.967741935483871, + "f1-score": 0.90625, + "precision": 0.8787878787878788, + "recall": 0.9354838709677419, "support": 31.0 }, "provider_selection": { @@ -8020,48 +7893,48 @@ "support": 25.0 }, "purchase": { - "f1-score": 1.0, + "f1-score": 0.8, "precision": 1.0, - "recall": 1.0, + "recall": 0.6666666666666666, "support": 6.0 }, "signup": { - "f1-score": 0.8888888888888888, - "precision": 0.8, + "f1-score": 0.8648648648648649, + "precision": 0.7619047619047619, "recall": 1.0, "support": 16.0 }, "task_execution": { - "f1-score": 0.8837209302325582, - "precision": 0.7916666666666666, - "recall": 1.0, + "f1-score": 0.8292682926829268, + "precision": 0.7727272727272727, + "recall": 0.8947368421052632, "support": 19.0 }, "troubleshooting": { - "f1-score": 0.8333333333333334, - "precision": 0.9090909090909091, + "f1-score": 0.8, + "precision": 0.8333333333333334, "recall": 0.7692307692307693, "support": 13.0 }, "weighted avg": { - "f1-score": 0.8995750585641842, - "precision": 0.9142834091715881, - "recall": 0.9041533546325878, + "f1-score": 0.894423568060199, + "precision": 0.9063956713482179, + "recall": 0.8977635782747604, "support": 313.0 } }, "suite": "train" }, "val": { - "accepted_accuracy": 0.8734, + "accepted_accuracy": 0.8608, "accepted_coverage": 0.9875, - "accuracy": 0.875, + "accuracy": 0.85, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_val_confusion_matrix.csv", "count": 80, "dataset_path": "/content/agentic-intent-classifier/data/subtype/val.jsonl", "fallback_rate": 0.0125, "head": "intent_subtype", - "macro_f1": 0.7429, + "macro_f1": 0.6722, "per_class_metrics": { "account_help": { "f1-score": 0.5, @@ -8069,11 +7942,11 @@ "recall": 0.5, "support": 2.0 }, - "accuracy": 0.875, + "accuracy": 0.85, "billing_help": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.0, + "precision": 0.0, + "recall": 0.0, "support": 1.0 }, "booking": { @@ -8083,8 +7956,8 @@ "support": 3.0 }, "comparison": { - "f1-score": 0.6666666666666666, - "precision": 1.0, + "f1-score": 0.5, + "precision": 0.5, "recall": 0.5, "support": 4.0 }, @@ -8095,8 +7968,8 @@ "support": 0.0 }, "deal_seeking": { - "f1-score": 0.6666666666666666, - "precision": 0.5, + "f1-score": 0.8, + "precision": 0.6666666666666666, "recall": 1.0, "support": 2.0 }, @@ -8131,9 +8004,9 @@ "support": 11.0 }, "macro avg": { - "f1-score": 0.660381593714927, - "precision": 0.6597643097643098, - "recall": 0.695959595959596, + "f1-score": 0.5974890931031281, + "precision": 0.5811447811447812, + "recall": 0.6353535353535353, "support": 80.0 }, "onboarding_setup": { @@ -8143,9 +8016,9 @@ "support": 5.0 }, "product_discovery": { - "f1-score": 0.9090909090909091, - "precision": 0.9090909090909091, - "recall": 0.9090909090909091, + "f1-score": 0.8571428571428571, + "precision": 0.9, + "recall": 0.8181818181818182, "support": 11.0 }, "provider_selection": { @@ -8167,8 +8040,8 @@ "support": 2.0 }, "task_execution": { - "f1-score": 0.8888888888888888, - "precision": 0.8, + "f1-score": 0.8421052631578947, + "precision": 0.7272727272727273, "recall": 1.0, "support": 8.0 }, @@ -8179,9 +8052,9 @@ "support": 1.0 }, "weighted avg": { - "f1-score": 0.8673611111111112, - "precision": 0.8841666666666667, - "recall": 0.875, + "f1-score": 0.8380398913951546, + "precision": 0.8423106060606059, + "recall": 0.85, "support": 80.0 } }, @@ -8304,12 +8177,12 @@ }, "hard_cases": { "accepted_accuracy": 1.0, - "accepted_coverage": 1.0, + "accepted_coverage": 0.9836, "accuracy": 1.0, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_hard_cases_confusion_matrix.csv", "count": 61, "dataset_path": "/content/agentic-intent-classifier/data/hard_cases.jsonl", - "fallback_rate": 0.0, + "fallback_rate": 0.0164, "head": "intent_type", "macro_f1": 1.0, "per_class_metrics": {