diff --git a/.gitignore b/.gitignore
index d000133179e90f0519a3b1315b4bd68c699185f8..f5dd181180f6006854ca19deff550ad922205dda 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@
 /trained_models/
 /temp/
+**/*.csv
+**/*.onnx
 #**/*.wav
 **/*.xlsx
-**/*.onnx
diff --git a/examples/download_wav/Temp Query 5_20251008-093912.csv b/examples/download_wav/Temp Query 5_20251008-093912.csv
new file mode 100644
index 0000000000000000000000000000000000000000..1df659d610eff85c19040c8969323ef2e25d05ee
--- /dev/null
+++ b/examples/download_wav/Temp Query 5_20251008-093912.csv
@@ -0,0 +1,101 @@
+date,overdue_term,id,case_id,credit_user_id,call_start_timestamp,call_end_timestamp,thirdpart_download_url
+11/10/2025,M3,201577107,62145483,2.05158E+18,1760156453,1760156464,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/6b76d306-b767-44e5-be9a-0a15d1165113.mp3
+11/10/2025,M3,201552895,61647547,2.04871E+18,1760150223,1760150235,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/91eb4d93-aaaf-4a22-b1b5-93f90790f360.mp3
+11/10/2025,M1,201571248,64869969,1.63814E+18,1760154872,1760154878,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/9feab432-a05f-4c12-a7a5-1de81c5c5552.mp3
+10/10/2025,M5,201481243,57774660,1.86995E+18,1760093720,1760093736,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/8ca27707-73e9-41a9-a531-f84011a2d021.mp3
+11/10/2025,M6,201602065,56556981,1.96434E+18,1760162403,1760162411,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/23edb55b-d7d7-496d-92d9-27be9a8d0f06.mp3
+10/10/2025,M3,201432876,62937736,1.71926E+18,1760081217,1760081223,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/d64b9511-1ada-435c-bf1d-7ff8edd194d1.mp3
+10/10/2025,M2,201418064,63818662,2.06059E+18,1760078017,1760078023,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/7a53b0cf-d4a8-496b-8533-578e1b3c8050.mp3
+11/10/2025,M1,201546922,65604125,1.86304E+18,1760149167,1760149175,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/e6adf4e4-269c-4668-955d-7b3dd1c60736.mp3
+10/10/2025,M3,201430098,61807602,1.85118E+18,1760080774,1760080785,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/b2b8e1e5-d92d-424d-9150-7af7506305c4.mp3
+10/10/2025,M1,201448566,64796208,1.65278E+18,1760085408,1760085415,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/09d6248c-111d-4b73-910f-e218049185d8.mp3
+11/10/2025,M4,201571566,60538522,1.88122E+18,1760154923,1760154930,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/944211c2-e492-4889-b655-e508fdd5879d.mp3
+11/10/2025,M1,201566967,65843234,2.02107E+18,1760154065,1760154073,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/bfa8591b-e527-45e9-ae57-7f74f9e7302b.mp3
+10/10/2025,M2,201447321,64267309,1.56498E+18,1760085020,1760085033,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/4bb8e69a-4c6c-4828-857a-ce0e43cc75a1.mp3
+11/10/2025,M1,201568415,65114574,4883832,1760154398,1760154405,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/51e805c9-5cc5-490b-bae2-7d5eab6f343c.mp3
+11/10/2025,M2,201605984,63943082,1.8374E+18,1760163009,1760163030,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/98af45d4-16cd-4eaf-9b91-05c7143a26bc.mp3
+10/10/2025,M1,201419656,66515322,1.49814E+17,1760078339,1760078345,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/91f43558-6567-43d9-a709-ffa3173c81b4.mp3
+10/10/2025,M2,201427406,63880041,1.56918E+18,1760080267,1760080275,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/375fdc20-428e-4658-aeb7-cb9cca17c534.mp3
+11/10/2025,M1,201575782,64887894,1.73042E+18,1760156066,1760156082,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/b2d6b5a2-3d7f-4fc7-abf6-258cd7a7de5f.mp3
+10/10/2025,M3,201418794,62368390,1.94558E+18,1760078142,1760078156,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/ba673b33-8496-4901-8f2f-8083802a5213.mp3
+10/10/2025,M1,201424572,66395236,1203507,1760079797,1760079804,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/a95596c0-8b46-4333-b126-6c4c11ca41fc.mp3
+11/10/2025,M2,201571228,64248917,1.88019E+18,1760154871,1760154880,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/e4eb3384-2d75-4563-b9a7-7bb3256a2aae.mp3
+11/10/2025,M4,201570642,60447265,1.98507E+18,1760154782,1760154787,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/a5440c2e-3fc5-43b2-8ecf-d4bde44f62e8.mp3
+10/10/2025,M5,201453357,58652419,1.85737E+18,1760086342,1760086350,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/68def6c9-130e-4705-b4b6-19d5d8b4b27d.mp3
+11/10/2025,M6,201573623,57234397,1.97251E+18,1760155303,1760155313,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/0ba67a44-c94a-4d68-a6ba-d003cf4b57c8.mp3
+10/10/2025,M5,201424683,57553385,1.86241E+18,1760079814,1760079831,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/50c26933-40a1-45bc-baf0-eb7b4c268ffb.mp3
+11/10/2025,M1,201570171,66334366,1.8276E+18,1760154717,1760154723,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/ecbb5b43-19cf-44e2-8ee4-ea131c706421.mp3
+10/10/2025,M4,201451276,59840709,2.04014E+18,1760085947,1760085952,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/6059e9c2-2f3c-4cef-a536-4796436b9765.mp3
+10/10/2025,M4,201432508,59867441,1.91396E+18,1760081156,1760081172,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/c0814466-4e13-4ef1-b4be-2fbe66eebfd8.mp3
+11/10/2025,M5,201612109,58418373,1.87946E+18,1760163975,1760163983,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/c7c54616-575a-4dc9-9820-fea245211933.mp3
+10/10/2025,M2,201432653,64650851,1.74295E+18,1760081176,1760081192,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/e8cc82f1-ded6-484f-9e6a-b9bf514eda04.mp3
+11/10/2025,M1,201580231,65755142,1.96545E+18,1760157408,1760157415,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/d2ed8a99-d8a4-4c0a-9b30-0a4f97e8db6e.mp3
+10/10/2025,M3,201430023,61812734,7400607,1760080766,1760080780,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/f693b1d5-dc99-43d9-a730-2040ca645f17.mp3
+10/10/2025,M3,201450322,62009884,2.007E+18,1760085762,1760085768,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/4915b87b-1169-41f7-af8c-6509a66dfbe6.mp3
+10/10/2025,M3,201431281,62172812,2.05076E+18,1760080963,1760080975,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/dc9c5331-04bb-4cbf-b789-61b0b811d6b6.mp3
+10/10/2025,M5,201430080,58314791,1.99801E+18,1760080772,1760080777,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/e571b6f4-6471-4311-81ed-cd1af7e55e07.mp3
+11/10/2025,M4,201538284,59471661,2.03412E+18,1760147632,1760147638,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/cfbe5f77-02b3-4a04-8948-9ca4237c3abc.mp3
+10/10/2025,M1,201447219,65817559,2.02777E+18,1760085001,1760085007,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/49dad1dd-d681-410a-87df-649eca036ff0.mp3
+10/10/2025,M1,201481818,66043196,1.93698E+18,1760093888,1760093894,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/93c19c1f-93fa-41ec-8f57-54a606a7f4a4.mp3
+10/10/2025,M1,201485519,66563695,2.074E+18,1760095020,1760095034,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/e8c214bd-6261-42a7-863b-3bcbd82f081e.mp3
+11/10/2025,M5,201595914,59014301,1.5286E+18,1760161399,1760161415,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/095bcf3f-1807-4c71-a997-ce6806b4da99.mp3
+11/10/2025,M2,201533318,63403949,1.88317E+18,1760146862,1760146871,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/8ae1702d-eca8-4e77-b39d-fd119c72499e.mp3
+11/10/2025,M4,201576553,60295505,1.80115E+18,1760156314,1760156320,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/47bad51d-65fb-4795-9211-a8a0434b95ad.mp3
+10/10/2025,M1,201485741,65280144,2.07517E+18,1760095107,1760095114,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/87d8ccb1-f7e9-4377-a1d8-7bf9228e0c3f.mp3
+10/10/2025,M1,201431349,64854591,1.58546E+18,1760080970,1760080978,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/6f983610-0fd3-4da6-be49-1cc50f205618.mp3
+10/10/2025,M5,201487648,57318618,1.64505E+18,1760095790,1760095797,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/254f796e-b955-4a5e-a190-1d3579474645.mp3
+11/10/2025,M1,201577796,64963614,1.86777E+18,1760156648,1760156657,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/b382a477-933e-43b1-8759-a2b4e17e92b2.mp3
+11/10/2025,M3,201533254,62192769,2.023E+18,1760146812,1760146817,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/ac78c5f9-7109-45fc-87da-5e275e0159f4.mp3
+10/10/2025,M1,201428974,65690748,1.89974E+18,1760080550,1760080557,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/b4e53f33-4c55-4b16-b786-6619332f47fc.mp3
+11/10/2025,M2,201546294,64684883,1.93667E+18,1760149074,1760149090,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/337edca5-80fa-4f9d-8820-3fb333b384d5.mp3
+10/10/2025,M1,201475668,66214001,1.80957E+18,1760092690,1760092695,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/21e2b2ec-2475-4692-9b67-2453e262e77b.mp3
+10/10/2025,M5,201459599,57741938,6633631,1760087612,1760087617,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/02bd6e79-efc1-4cfd-bee9-12967c844735.mp3
+11/10/2025,M3,201551060,61444707,1.94443E+18,1760149891,1760149899,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/700c2e32-c5bd-48f3-8085-0617694963dd.mp3
+10/10/2025,M4,201453055,59345041,1.68365E+18,1760086275,1760086282,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/4f1ef95b-3432-41d3-8970-1698504ba010.mp3
+10/10/2025,M1,201426891,66520128,1728931,1760080190,1760080196,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/ec7180ac-2948-49a3-b709-f60b80dfee27.mp3
+12/10/2025,M1,201704571,66237684,1.77477E+18,1760229906,1760229911,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251012/21962/b70ed0f3-145f-49a6-9dbb-aa695e21d7de.mp3
+10/10/2025,M1,201457899,65109188,1.58483E+18,1760087330,1760087339,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/c5f910f6-6a5e-4bd0-8a64-f2805837558d.mp3
+11/10/2025,M2,201537965,63920995,2.04332E+18,1760147578,1760147593,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/38ff4d8b-4db3-4410-902e-3953699bf4eb.mp3
+11/10/2025,M1,201568081,66644507,1.92267E+18,1760154313,1760154318,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/a801cfcc-40d2-4dde-9994-b709836df856.mp3
+11/10/2025,M3,201539641,62112487,1.75162E+18,1760147868,1760147875,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/56ec2c07-3bf5-47ba-82c6-039bf094d6ca.mp3
+10/10/2025,M1,201483514,65958944,1.96386E+18,1760094328,1760094335,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/edd080b5-2439-4079-a27f-2f0941217825.mp3
+10/10/2025,M5,201417598,57494166,1.59238E+18,1760077922,1760077928,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/65750a23-6edc-479a-bea4-14ba9e43648e.mp3
+11/10/2025,M1,201528466,65224705,1.75014E+18,1760145272,1760145278,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/5b63572f-79d2-4d22-9ded-4fa2e3cd372b.mp3
+10/10/2025,M5,201453641,58921447,1.92301E+18,1760086392,1760086405,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/4a9b93d4-9878-4023-8dd2-40fcf9502a49.mp3
+11/10/2025,M1,201611955,65789335,1.88401E+18,1760163939,1760163946,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/6fb4f49d-fe9c-4a64-8c74-b5649d7e8175.mp3
+10/10/2025,M1,201459093,66318002,1.61088E+18,1760087527,1760087538,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/8a608622-e3d7-4a96-89ac-1300b7653c6f.mp3
+10/10/2025,M2,201418416,63100145,1.85044E+18,1760078077,1760078083,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/f6ab0a88-2055-4b88-9fbd-7af4aef5731f.mp3
+11/10/2025,M3,201537163,61356706,2.04189E+18,1760147448,1760147453,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/f1f6ad6a-7399-4fb2-8073-c83ef7093b6e.mp3
+10/10/2025,M3,201480897,61752670,1.99088E+18,1760093653,1760093662,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/cdf7552c-a3a8-4cfc-a4b1-651f84141090.mp3
+11/10/2025,M2,201605821,63901708,2.06357E+18,1760162987,1760162993,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/70b3816a-2e43-403a-97e0-dd3288596c71.mp3
+10/10/2025,M3,201457652,61356706,2.04189E+18,1760087292,1760087299,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/d5f5598a-b93c-4fb8-8bb1-56acc9bb0033.mp3
+10/10/2025,M2,201480118,64077815,1.99009E+18,1760093518,1760093526,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/8cac232e-2ed4-4506-8bcf-2a8e4bca91b5.mp3
+11/10/2025,M1,201551882,65617862,1.65598E+18,1760150016,1760150024,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/e71c8472-c3f7-486e-a388-1196836899bf.mp3
+10/10/2025,M5,201417451,58993884,1.18898E+16,1760077893,1760077905,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/2965b989-cf94-4dc4-afaa-5fda4ac4bb34.mp3
+11/10/2025,M5,201547506,58539902,1.92196E+18,1760149254,1760149265,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/1cb5b911-69c9-4b5a-ab2f-e4bad61dd0be.mp3
+11/10/2025,M1,201606566,66579640,1.92316E+18,1760163098,1760163103,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/2e5ff340-8197-4069-8533-0d47a221dc57.mp3
+11/10/2025,M2,201545849,63976411,2.01076E+18,1760148993,1760148999,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/e7592b1f-d24d-4fb5-afb9-0d64f23719d5.mp3
+10/10/2025,M1,201487535,66304049,1.96729E+18,1760095749,1760095754,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/03b31201-d6f7-4246-80ef-ec902f93e6bf.mp3
+10/10/2025,M1,201458971,66590224,2.06508E+18,1760087508,1760087515,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/01013ca6-1659-4613-8ae7-5d79372d4464.mp3
+11/10/2025,M4,201548032,59720355,1.54183E+18,1760149338,1760149343,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/61bafa17-b411-4d1a-b9bd-1a76c5087b9e.mp3
+10/10/2025,M5,201430001,57789932,1.99701E+18,1760080762,1760080768,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/aa25ffd1-82a2-4048-938d-0adc335cec41.mp3
+11/10/2025,M1,201596095,66001014,1.94845E+18,1760161430,1760161436,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/76c67081-af66-4a80-b531-25b14a8e0443.mp3
+11/10/2025,M3,201549933,62873165,1.87383E+18,1760149739,1760149747,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/fd0caad7-72fd-4152-a8ed-58253e946567.mp3
+10/10/2025,M5,201447596,58417708,1.9626E+18,1760085087,1760085092,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/0f8f64c4-2587-4501-abb8-23e0e5389635.mp3
+11/10/2025,M1,201596157,65991243,1.88004E+18,1760161446,1760161452,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/8f731992-cbc6-45e1-8203-5628d51a6e45.mp3
+10/10/2025,M2,201391828,63462209,1.1835E+16,1760067513,1760067535,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/2c641568-ebf6-4989-abeb-e2bc404d79e8.mp3
+11/10/2025,M4,201579066,60241526,1.95443E+18,1760157061,1760157070,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/34c79266-7c59-4e26-b936-05d64716fa79.mp3
+11/10/2025,M3,201539123,61552513,1.65655E+18,1760147755,1760147762,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/1dc2d33a-f080-4cb4-ad66-fb50761b500a.mp3
+11/10/2025,M5,201607636,57899370,1.93241E+18,1760163267,1760163287,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/d8e13804-692a-4cf7-b4e2-781384a1d559.mp3
+10/10/2025,M4,201426209,60181850,1.79511E+18,1760080075,1760080081,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/8f81c559-7392-46ff-8ebb-1a6edc41381c.mp3
+11/10/2025,M1,201535197,66655594,1.66159E+18,1760147133,1760147139,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/d793a0e3-bd7b-44f3-a1e2-bc1eb0908c1a.mp3
+11/10/2025,M4,201613127,61191667,1.78852E+18,1760164234,1760164240,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/4e794fb2-01bf-41ff-843f-945b2b1ec9df.mp3
+10/10/2025,M3,201456582,61353852,1.70556E+18,1760087088,1760087093,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/5fe67f28-5f86-43bb-a87f-589b75118d56.mp3
+11/10/2025,M2,201536974,63373730,2.02636E+18,1760147420,1760147433,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/cf2bf8cc-e2af-478a-96e4-686c59a98d4d.mp3
+11/10/2025,M6,201598303,57270639,1.57805E+18,1760161833,1760161850,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/07e2de01-8040-4942-8586-77d6eb38b64f.mp3
+11/10/2025,M4,201577614,60545450,1.95248E+18,1760156607,1760156635,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/53f6d6eb-c22e-4935-9d3c-941969a0241f.mp3
+10/10/2025,M1,201451460,66451819,1406890,1760085972,1760085978,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251010/21962/43061915-f8e0-405b-9be0-bd3826d0aa69.mp3
+11/10/2025,M5,201566778,57480954,1.79569E+18,1760154035,1760154041,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/f737520a-d62f-4c71-a81c-bfbcbd2887d3.mp3
+11/10/2025,M5,201579474,58000396,1.17763E+16,1760157205,1760157215,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/b8d5a9b1-13cf-4667-8271-1b562de5dd22.mp3
+11/10/2025,M1,201547564,66391023,987018,1760149263,1760149269,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/434cea47-98a9-4f24-b29e-bee36e9f9832.mp3
+11/10/2025,M5,201538978,57340689,1.62497E+18,1760147736,1760147755,https://idn1.obs.ap-southeast-4.myhuaweicloud.com/20251011/21962/a74e253e-34cf-42df-9c3c-c8f6a45b994e.mp3
diff --git a/examples/download_wav/step_1_download_wav.py b/examples/download_wav/step_1_download_wav.py
index 99e77cc73bc39d3a44612b5d8c3ef23020c098f7..7e94eb44c0109a331c6172b56a7ccda4cc74b80a 100644
--- a/examples/download_wav/step_1_download_wav.py
+++ b/examples/download_wav/step_1_download_wav.py
@@ -31,7 +31,7 @@ def get_args():
     )
     parser.add_argument(
         "--output_dir",
-        default=(project_path / "data/calling/358/wav_2ch").as_posix(),
+        default=(project_path / "data/calling/62/wav_2ch").as_posix(),
         type=str
     )
     args = parser.parse_args()
@@ -39,37 +39,7 @@
 
 excel_file_str = """
-AIAgent-CallLog-20250929100824.xlsx
-AIAgent-CallLog-20250929134959.xlsx
-AIAgent-CallLog-20250929135030.xlsx
-AIAgent-CallLog-20250929135052.xlsx
-AIAgent-CallLog-20250929135122.xlsx
-AIAgent-CallLog-20250929135134.xlsx
-AIAgent-CallLog-20250929135209.xlsx
-AIAgent-CallLog-20250929135219.xlsx
-AIAgent-CallLog-20250929135247.xlsx
-AIAgent-CallLog-20250929135300.xlsx
-AIAgent-CallLog-20250929135311.xlsx
-AIAgent-CallLog-20250929135335.xlsx
-AIAgent-CallLog-20250929135344.xlsx
-AIAgent-CallLog-20250929135355.xlsx
-AIAgent-CallLog-20250929135443.xlsx
-AIAgent-CallLog-20250929135452.xlsx
-AIAgent-CallLog-20250929135501.xlsx
-AIAgent-CallLog-20250929135537.xlsx
-AIAgent-CallLog-20250929135544.xlsx
-AIAgent-CallLog-20250929135554.xlsx
-AIAgent-CallLog-20250929135630.xlsx
-AIAgent-CallLog-20250929135701.xlsx
-AIAgent-CallLog-20250929135710.xlsx
-AIAgent-CallLog-20250929135716.xlsx
-AIAgent-CallLog-20250929135755.xlsx
-AIAgent-CallLog-20250929135800.xlsx
-AIAgent-CallLog-20250929135809.xlsx
-AIAgent-CallLog-20250929135842.xlsx
-AIAgent-CallLog-20250929135849.xlsx
-AIAgent-CallLog-20250929135858.xlsx
-AIAgent-CallLog-20250929135909.xlsx
+Temp Query 5_20251008-093912.csv
 """
@@ -101,11 +71,16 @@ def main():
             continue
         excel_file = excel_file_dir / name
-        df = pd.read_excel(excel_file.as_posix())
+        # df = pd.read_excel(excel_file.as_posix())
+        df = pd.read_csv(excel_file.as_posix())
         for i, row in tqdm(df.iterrows()):
-            call_date = row["Attempt time"]
-            call_id = row["Call ID"]
-            record_url = row["Recording file"]
+            call_date = "2025-10-12 00:00:00"
+            record_url = row["thirdpart_download_url"]
+            call_id = Path(record_url).stem
+
+            # call_date = row["Attempt time"]
+            # call_id = row["Call ID"]
+            # record_url = row["Recording file"]
             if pd.isna(record_url):
                 continue
@@ -137,7 +112,7 @@ def main():
             if resp.status_code != 200:
                 raise AssertionError("status_code: {}; text: {}".format(resp.status_code, resp.text))
 
-            filename = output_dir / "{}.wav".format(call_id)
+            filename = output_dir / "{}.mp3".format(call_id)
             with open(filename.as_posix(), "wb") as f:
                 f.write(resp.content)
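The step_1_download_wav.py change above replaces the Excel call-log exports with the new CSV query export: the call id is now the UUID stem of thirdpart_download_url, and the payload is stored as .mp3 rather than .wav. For reference, a minimal self-contained sketch of that loop; "query.csv" and "wav_2ch" here are placeholder paths, not the script's argparse defaults.

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Minimal sketch of the CSV-driven download performed by step_1_download_wav.py.
from pathlib import Path

import pandas as pd
import requests

output_dir = Path("wav_2ch")
output_dir.mkdir(parents=True, exist_ok=True)

df = pd.read_csv("query.csv")
for _, row in df.iterrows():
    record_url = row["thirdpart_download_url"]
    if pd.isna(record_url):
        continue
    # recordings are keyed by the UUID in the URL, e.g. ".../6b76d306-....mp3"
    call_id = Path(record_url).stem
    resp = requests.get(record_url)
    if resp.status_code != 200:
        raise AssertionError(f"status_code: {resp.status_code}; text: {resp.text}")
    (output_dir / f"{call_id}.mp3").write_bytes(resp.content)

Deriving call_id from the URL stem keeps output names stable across re-runs, so a simple existence check is enough to skip files that were already fetched.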
diff --git a/examples/download_wav/step_2_to_1ch.py b/examples/download_wav/step_2_to_1ch.py
index 3d839178e1e6b1d18cf0cf934548ff2944097524..b938d7eb602af7a2e4390d216a80cf50ab2531f9 100644
--- a/examples/download_wav/step_2_to_1ch.py
+++ b/examples/download_wav/step_2_to_1ch.py
@@ -3,8 +3,9 @@
 import argparse
 import os
 from pathlib import Path
-import time
 
+import librosa
+import numpy as np
 from scipy.io import wavfile
 from tqdm import tqdm
 
@@ -16,12 +17,12 @@ def get_args():
     parser.add_argument(
         "--audio_dir",
-        default=(project_path / "data/calling/358/wav_2ch").as_posix(),
+        default=(project_path / "data/calling/62/wav_2ch").as_posix(),
         type=str
     )
     parser.add_argument(
         "--output_dir",
-        default=(project_path / "data/calling/358/wav_1ch").as_posix(),
+        default=(project_path / "data/calling/62/wav_1ch").as_posix(),
         type=str
     )
     args = parser.parse_args()
@@ -36,13 +37,13 @@ def main():
     output_dir.mkdir(parents=True, exist_ok=True)
 
     finished = set()
-    for filename in tqdm(list(output_dir.glob("*.wav"))):
+    for filename in tqdm(list(output_dir.glob("*.mp3"))):
         splits = filename.stem.split("_")
         call_id = splits[3]
         finished.add(call_id)
     print(f"finished count: {len(finished)}")
 
-    for filename in tqdm(list(audio_dir.glob("*.wav"))):
+    for filename in tqdm(list(audio_dir.glob("*.mp3"))):
         call_id = filename.stem
 
         if call_id in finished:
@@ -51,16 +52,19 @@
         finished.add(call_id)
 
         try:
-            sample_rate, signal = wavfile.read(filename.as_posix())
+            # sample_rate, signal = wavfile.read(filename.as_posix())
+            signal, sample_rate = librosa.load(filename.as_posix(), sr=8000, mono=False)
+            signal = np.array(signal * (1 << 15), dtype=np.int16)
         except UnboundLocalError as error:
             print(f"wavfile read failed. error type: {type(error)}, text: {str(error)}, filename: {filename.as_posix()}")
             raise error
 
         if sample_rate != 8000:
             raise AssertionError
-        signal = signal[:, 0]
+        # signal = signal[:, 0]
+        signal = signal[0, :]
 
-        to_filename = output_dir / f"active_media_r_{call_id}_fi-FI_none.wav"
+        to_filename = output_dir / f"active_media_r_{call_id}_id-ID_none.wav"
         try:
             wavfile.write(
                 to_filename.as_posix(),
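step_2_to_1ch.py now decodes the downloaded mp3 files with librosa (scipy.io.wavfile cannot read mp3), rescales the float signal to 16-bit PCM, and keeps only channel 0 before writing a mono 8 kHz wav. A condensed sketch of just that conversion, with placeholder file names:

# Sketch of the mp3 -> mono 8 kHz int16 wav conversion used in step_2_to_1ch.py.
# "in.mp3" and "out.wav" are placeholder names.
import librosa
import numpy as np
from scipy.io import wavfile

# mono=False keeps both channels; shape [channels, num_samples], float32 in [-1, 1].
# Note: a genuinely mono mp3 would come back 1-D, so this assumes a 2-channel recording.
signal, sample_rate = librosa.load("in.mp3", sr=8000, mono=False)
signal = np.array(signal * (1 << 15), dtype=np.int16)  # rescale to 16-bit PCM
signal = signal[0, :]                                  # keep the first channel only
wavfile.write("out.wav", sample_rate, signal)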
diff --git a/examples/download_wav/step_3_split_two_second_wav.py b/examples/download_wav/step_3_split_two_second_wav.py
index b7dee18fff782d7e2061e0ab7bc7725d940b6045..ac2cc043f5c95033d0799c8c7723d3ab0dd0ba5c 100644
--- a/examples/download_wav/step_3_split_two_second_wav.py
+++ b/examples/download_wav/step_3_split_two_second_wav.py
@@ -15,17 +15,21 @@ def get_args():
     parser.add_argument(
         "--audio_dir",
-        default=(project_path / "data/calling/358/wav_2ch").as_posix(),
+        # default=(project_path / "data/calling/66/wav_1ch").as_posix(),
+        # default=(project_path / "data/calling/358/wav_1ch/finished/voicemail_annotation").as_posix(),
+        default=r"D:\Users\tianx\HuggingSpaces\template_match_asr\data\wav\early_media\52\music",
         type=str
     )
     parser.add_argument(
         "--output_dir",
-        default=(project_path / "data/calling/358/wav_segmented").as_posix(),
+        # default=(project_path / "data/calling/358/wav_segmented").as_posix(),
+        default=r"D:\Users\tianx\HuggingSpaces\template_match_asr\data\wav\early_media\52\music\wav_segmented",
         type=str
     )
     parser.add_argument(
         "--first_n_seconds",
-        default=8,
+        default=1000,
         type=int
     )
     args = parser.parse_args()
@@ -40,12 +44,16 @@ def main():
     output_dir.mkdir(parents=True, exist_ok=True)
 
     for filename in tqdm(list(audio_dir.glob("*.wav"))):
-        call_id = filename.stem
+        splits = filename.stem.split("_")
+        call_id = splits[3]
+        language = splits[4]
+        scene_id = splits[5]
+
         sample_rate, signal = wavfile.read(filename.as_posix())
         if sample_rate != 8000:
             raise AssertionError
-        signal = signal[:, 0]
+        # signal = signal[:, 0]
 
         signal_length = len(signal) - sample_rate * 2
         if signal_length <= 0:
             continue
@@ -56,8 +64,7 @@
             end = begin + sample_rate * 2
             sub_signal = signal[begin: end]
 
-            ts = int(time.time() * 1000)
-            to_filename = output_dir / "{}_fi-FI_none_{}.wav".format(call_id, ts)
+            to_filename = output_dir / f"active_media_r_{call_id}_{language}_{scene_id}_{begin}.wav"
             wavfile.write(
                 to_filename.as_posix(),
                 sample_rate,
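step_3_split_two_second_wav.py cuts each file into non-overlapping 2-second windows (16000 samples at 8 kHz) and, after this change, names every slice by its start offset instead of a wall-clock timestamp. A sketch of the windowing arithmetic in isolation, assuming the loop steps by two-second strides up to signal_length, as the hunk's surrounding context suggests:

# Isolated sketch of the 2-second windowing in step_3_split_two_second_wav.py.
import numpy as np

sample_rate = 8000
signal = np.zeros(7 * sample_rate, dtype=np.int16)  # placeholder: a 7-second signal

# Stopping sample_rate * 2 samples early guarantees every window is a full 2 s.
signal_length = len(signal) - sample_rate * 2
for begin in range(0, signal_length, sample_rate * 2):
    end = begin + sample_rate * 2
    sub_signal = signal[begin: end]  # always exactly 16000 samples
    # saved as active_media_r_{call_id}_{language}_{scene_id}_{begin}.wav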
diff --git a/examples/lstm_badcase_filter/step_1_badcase_filter.py b/examples/lstm_badcase_filter/step_1_badcase_filter.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c02593022eb8cb5924e7b3c8145b304b04dfe00
--- /dev/null
+++ b/examples/lstm_badcase_filter/step_1_badcase_filter.py
@@ -0,0 +1,233 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import argparse
+from pathlib import Path
+
+from tqdm import tqdm
+
+from gradio_client import Client, handle_file
+import librosa
+import numpy as np
+import onnxruntime as ort
+from scipy.io import wavfile
+import torch
+import torchaudio
+import shutil
+
+from project_settings import project_path
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--audio_dir",
+        # default=r"D:\Users\tianx\HuggingDatasets\international_voice\data\sea-idn\audio_lib_hkg_1\audio_lib_hkg_1\pt-BR2",
+        # default=r"D:\Users\tianx\HuggingDatasets\international_voice\data\sea-idn\audio_lib_hkg_1\audio_lib_hkg_1\pt-BR",
+        default=r"D:\Users\tianx\HuggingDatasets\calling_analysis\data\pt-BR\bell_and_di_then_mute",
+        type=str,
+    )
+    parser.add_argument(
+        "--onnx_model_file",
+        # default=(project_path / "examples/online_model_test/models/pt-BR.onnx").as_posix(),
+        default="../online_model_test/models/pt-BR.onnx",
+        type=str
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=(project_path / "data/badcase").as_posix(),
+        type=str,
+    )
+    args = parser.parse_args()
+    return args
+
+
+class OnlineModelConfig(object):
+    def __init__(self,
+                 sample_rate: int = 8000,
+                 n_fft: int = 1024,
+                 hop_size: int = 512,
+                 n_mels: int = 80,
+                 f_min: float = 10.0,
+                 f_max: float = 3800.0,
+                 ):
+        self.sample_rate = sample_rate
+        self.n_fft = n_fft
+        self.hop_size = hop_size
+        self.n_mels = n_mels
+        self.f_min = f_min
+        self.f_max = f_max
+
+
+class OnlineModelInference(object):
+    def __init__(self,
+                 model_path: str,
+                 ):
+        self.model_path = model_path
+
+        providers = [
+            "CUDAExecutionProvider", "CPUExecutionProvider"
+        ] if torch.cuda.is_available() else [
+            "CPUExecutionProvider"
+        ]
+        self.session = ort.InferenceSession(self.model_path, providers=providers)
+
+        self.config = OnlineModelConfig()
+
+        self.mel_transform = torchaudio.transforms.MelSpectrogram(
+            sample_rate=self.config.sample_rate,
+            n_fft=self.config.n_fft,
+            hop_length=self.config.hop_size,
+            n_mels=self.config.n_mels,
+            f_min=self.config.f_min,
+            f_max=self.config.f_max,
+            window_fn=torch.hamming_window
+        )
+
+    def predict_by_ndarray(self,
+                           sub_signal: np.ndarray,
+                           h: np.ndarray = None,
+                           c: np.ndarray = None,
+                           ):
+        # sub_signal, shape: [num_samples,]
+        sub_signal = torch.tensor(sub_signal, dtype=torch.float32)
+
+        sub_signal = sub_signal.unsqueeze(0)
+        # sub_signal, shape: [1, num_samples]
+        mel_spec = self.mel_transform.forward(sub_signal)
+        # mel_spec, shape: [1, n_mels, n_frames]
+        mel_spec = torch.transpose(mel_spec, dim0=1, dim1=2)
+        # mel_spec, shape: [1, n_frames, n_mels]
+
+        h = torch.tensor(h) if h is not None else None
+        c = torch.tensor(c) if c is not None else None
+        label, prob, h, c = self.predict_by_mel_spec(mel_spec, h=h, c=c)
+        # h, c: torch.Tensor
+        h = h.numpy()
+        c = c.numpy()
+        return label, prob, h, c
+
+    def predict_by_mel_spec(self,
+                            mel_spec: torch.Tensor,
+                            h: torch.Tensor = None,
+                            c: torch.Tensor = None,
+                            ):
+        # mel_spec, shape: [1, n_frames, n_mels]
+
+        if h is None:
+            h = np.zeros((3, 1, 64), dtype=np.float32)  # 3-layer LSTM, batch size 1, hidden size 64
+        else:
+            h = h.numpy()
+        if c is None:
+            c = np.zeros((3, 1, 64), dtype=np.float32)  # 3-layer LSTM, batch size 1, hidden size 64
+        else:
+            c = c.numpy()
+
+        mel_spec_np = mel_spec.numpy()
+        outputs = self.session.run(
+            input_feed={
+                "input": mel_spec_np,
+                "h": h,
+                "c": c
+            },
+            output_names=[
+                "output", "h_out", "c_out"
+            ],
+        )
+        logits, h, c = outputs
+        # logits, np.ndarray, shape: [b, num_labels]
+        # h, c: np.ndarray
+        h = torch.tensor(h)
+        c = torch.tensor(c)
+
+        probs = torch.softmax(torch.tensor(logits), dim=1)
+        max_prob, predicted_label_index = torch.max(probs, dim=1)
+
+        label = self.get_label_by_index(predicted_label_index.item())
+        prob = max_prob.item()
+        return label, prob, h, c
+
+    @staticmethod
+    def get_label_by_index(index: int):
+        label_map = {
+            0: "voice",
+            1: "voicemail",
+            2: "mute",
+            3: "noise"
+        }
+        result = label_map[index]
+        return result
+
+
+def main():
+    args = get_args()
+
+    client = Client("http://127.0.0.1:7864/")
+    # client = Client("http://10.75.27.247:7864/")
+
+    audio_dir = Path(args.audio_dir)
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    model = OnlineModelInference(model_path=args.onnx_model_file)
+
+    for filename in tqdm(audio_dir.glob("**/active_media_r_*.wav")):
+        splits = filename.stem.split("_")
+        call_id = splits[3]
+        language = splits[4]
+        scene_id = splits[5]
+
+        signal, sample_rate = librosa.load(filename.as_posix(), sr=8000)
+        duration = librosa.get_duration(y=signal, sr=sample_rate)
+        signal_length = len(signal)
+        if signal_length == 0:
+            continue
+
+        begin = 0
+        end = begin + sample_rate * 2
+        sub_signal = signal[begin: end]
+        if sub_signal.shape[0] != 16000:
+            continue
+
+        h = None
+        c = None
+        label1, prob1, h, c = model.predict_by_ndarray(sub_signal, h=h, c=c)
+
+        sub_signal_ = np.array(sub_signal * (1 << 15), dtype=np.int16)
+        temp_file = "temp.wav"
+
+        wavfile.write(
+            temp_file,
+            8000,
+            sub_signal_,
+        )
+
+        # label2, prob2 = client.predict(
+        #     audio_t=handle_file(temp_file),
+        #     model_name="voicemail-pt-br-2-ch4",
+        #     ground_true="Hello!!",
+        #     api_name="/when_click_cls_button"
+        # )
+        label2, prob2 = client.predict(
+            audio_t=handle_file(temp_file),
+            model_name="sound-8-ch4",
+            ground_true="Hello!!",
+            api_name="/when_click_cls_button"
+        )
+
+        print(label1)
+        print(label2)
+        # label2 = "voicemail"
+        # label1 = "non_voicemail"
+        if label2 in ("voicemail", "bell") and label1 != "voicemail":
+            tgt_file = output_dir / f"active_media_r_{call_id}_{language}_{scene_id}_0.wav"
+            if not tgt_file.exists():
+                shutil.move(
+                    temp_file,
+                    tgt_file.as_posix(),
+                )
+
+    return
+
+
+if __name__ == "__main__":
+    main()
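The new step_1_badcase_filter.py wraps a 3-layer LSTM classifier exported to ONNX and threads the hidden/cell state (h, c) through successive 2-second chunks. A condensed sketch of that stateful call sequence; the tensor names ("input", "h", "c", "output", "h_out", "c_out"), the state shape (3, 1, 64), and the mel-spectrogram settings come from the script, while "model.onnx" and the zero-filled signal are placeholders:

# Condensed sketch of stateful, chunk-by-chunk inference with the ONNX LSTM classifier.
import numpy as np
import onnxruntime as ort
import torch
import torchaudio

mel_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=8000, n_fft=1024, hop_length=512, n_mels=80,
    f_min=10.0, f_max=3800.0, window_fn=torch.hamming_window,
)
session = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])

h = np.zeros((3, 1, 64), dtype=np.float32)  # 3 layers, batch 1, hidden size 64
c = np.zeros((3, 1, 64), dtype=np.float32)

signal = np.zeros(4 * 16000, dtype=np.float32)  # placeholder: 8 s of audio at 8 kHz

for begin in range(0, len(signal), 16000):  # 2-second chunks
    chunk = torch.tensor(signal[begin: begin + 16000], dtype=torch.float32)
    # [1, n_mels, n_frames] -> [1, n_frames, n_mels]
    mel = mel_transform(chunk.unsqueeze(0)).transpose(1, 2).numpy()
    logits, h, c = session.run(
        output_names=["output", "h_out", "c_out"],
        input_feed={"input": mel, "h": h, "c": c},
    )
    label_index = int(np.argmax(logits, axis=1)[0])  # 0=voice, 1=voicemail, 2=mute, 3=noise

Feeding h and c back in on each iteration is what makes the model "online": the verdict on chunk N can depend on everything heard in chunks 0..N-1.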
default=r"D:\Users\tianx\HuggingDatasets\international_voice\data\sea-idn\audio_lib_hkg_1\audio_lib_hkg_1\zh-TW", + default=r"D:\Users\tianx\HuggingDatasets\international_voice\data\sea-idn\audio_lib_hkg_1\audio_lib_hkg_1\th-TH\th-TH\early_media_no_voice", type=str, ) - parser.add_argument("--onnx_model_file", default="zh-TW.onnx", type=str) + parser.add_argument("--onnx_model_file", default="models/th-TH.onnx", type=str) parser.add_argument("--target_duration", default=8.0, type=float) - parser.add_argument("--output_file", default="zh_tw_predict.xlsx", type=str) + parser.add_argument("--output_file", default="th-TH_predict.xlsx", type=str) args = parser.parse_args() return args @@ -177,13 +177,15 @@ def main(): for begin in range(0, target_duration, sample_rate*2): end = begin + sample_rate*2 sub_signal = signal[begin: end] - if len(sub_signal) == 0: + if len(sub_signal) < 0.5 * sample_rate: break label, prob, h, c = model.predict_by_ndarray(sub_signal, h=h, c=c) predict_result.append({ "label": label, "prob": prob, }) + if len(predict_result) == 0: + continue label_list = [p["label"] for p in predict_result] predict_result_ = json.dumps(predict_result, ensure_ascii=False, indent=4) label2 = predict_result[0]["label"] diff --git a/examples/online_model_test/step_2_audio_filter.py b/examples/online_model_test/step_2_audio_filter.py index 9c5dfbd44696859a6a5a6d3435e77890b87c397a..28f00e742cf97ea474084f3e7545d8e30f6620c2 100644 --- a/examples/online_model_test/step_2_audio_filter.py +++ b/examples/online_model_test/step_2_audio_filter.py @@ -10,10 +10,10 @@ import pandas as pd def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--predict_file", default="zh_tw_predict.xlsx", type=str) + parser.add_argument("--predict_file", default="th-TH_predict.xlsx", type=str) parser.add_argument( "--output_dir", - default=r"D:\Users\tianx\HuggingDatasets\international_voice\data\calling\886", + default=r"D:\Users\tianx\HuggingDatasets\international_voice\data\sea-idn\audio_lib_hkg_1\audio_lib_hkg_1\th-TH\th-TH\early_media_no_voice\bad_case", type=str, ) args = parser.parse_args() @@ -24,12 +24,16 @@ def main(): args = get_args() output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) df = pd.read_excel(args.predict_file) for i, row in df.iterrows(): filename = row["filename"] ground_truth_ = row["ground_truth_"] + flag = row["flag"] + if flag == 1: + continue if ground_truth_ == "voicemail": shutil.copy( filename, diff --git a/examples/online_model_test/step_3_make_test.py b/examples/online_model_test/step_3_make_test.py index ab8e3941721e093f53b149ebd06d99d5661e6276..d25e8d363c20e4e75941b141371663adc29248ec 100644 --- a/examples/online_model_test/step_3_make_test.py +++ b/examples/online_model_test/step_3_make_test.py @@ -15,12 +15,12 @@ def get_args(): parser.add_argument( "--src_dir", - default=r"D:\Users\tianx\HuggingDatasets\international_voice\data\calling\65\voicemail", + default=r"D:\Users\tianx\HuggingDatasets\international_voice\data\calling\63\voicemail", type=str, ) parser.add_argument( "--tgt_dir", - default=r"D:\Users\tianx\HuggingDatasets\international_voice\data\voice_test_examples\65\95", + default=r"D:\Users\tianx\HuggingDatasets\international_voice\data\voice_test_examples\63\96", type=str, ) parser.add_argument( diff --git a/examples/online_model_test/test.py b/examples/online_model_test/test.py new file mode 100644 index 0000000000000000000000000000000000000000..38f6168c1b152a55cd25d4d83007f762bfeff3ca --- /dev/null +++ 
diff --git a/examples/online_model_test/test.py b/examples/online_model_test/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..38f6168c1b152a55cd25d4d83007f762bfeff3ca
--- /dev/null
+++ b/examples/online_model_test/test.py
@@ -0,0 +1,84 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import argparse
+from collections import defaultdict
+from pathlib import Path
+import shutil
+
+from gradio_client import Client, handle_file
+import librosa
+import pandas as pd
+from tqdm import tqdm
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--finished_dir",
+        default=r"D:\Users\tianx\HuggingSpaces\cc_audio_8\data\calling\66\wav_1ch",
+        type=str,
+    )
+    parser.add_argument(
+        "--src_dir",
+        default=r"D:/Users/tianx/HuggingDatasets/international_voice/data/sea-idn/audio_lib_hkg_1/audio_lib_hkg_1/th-TH/th-TH/",
+        type=str,
+    )
+    parser.add_argument(
+        "--tgt_dir",
+        default=r"D:\Users\tianx\HuggingDatasets\international_voice\data\sea-idn\audio_lib_hkg_1\audio_lib_hkg_1\th-TH\bad_case",
+        type=str,
+    )
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = get_args()
+
+    finished_dir = Path(args.finished_dir)
+    src_dir = Path(args.src_dir)
+    tgt_dir = Path(args.tgt_dir)
+    tgt_dir.mkdir(parents=True, exist_ok=True)
+
+    # finished
+    finished = set()
+    for filename in finished_dir.glob("*.wav"):
+        splits = filename.stem.split("_")
+        call_id = splits[3]
+        if call_id in ("27521940-feef-4bfa-ba55-b1f00a10c64d",):
+            print(f"call_id: {call_id}")
+
+        finished.add(call_id)
+    print(f"finished count: {len(finished)}")
+
+    # call_id_to_wav_file_list
+    call_id_to_wav_file_list = defaultdict(list)
+    for filename in src_dir.glob("**/*.wav"):
+        splits = filename.stem.split("_")
+        call_id = splits[3]
+        language = splits[4]
+        scene_id = splits[5]
+        if call_id in ("27521940-feef-4bfa-ba55-b1f00a10c64d",):
+            print(f"call_id: {call_id}")
+
+        call_id_to_wav_file_list[call_id].append(filename.as_posix())
+    print(f"src count: {len(call_id_to_wav_file_list)}")
+
+    for filename in tqdm(src_dir.glob("**/active_media_r_*.wav")):
+        splits = filename.stem.split("_")
+        call_id = splits[3]
+        if call_id in ("27521940-feef-4bfa-ba55-b1f00a10c64d",):
+            print(f"call_id: {call_id}")
+
+        if call_id in finished:
+            wav_file_list = call_id_to_wav_file_list[call_id]
+            for wav_file in wav_file_list:
+                shutil.move(
+                    wav_file,
+                    tgt_dir.as_posix(),
+                )
+    return
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/vm_sound_classification/requirements.txt b/examples/sound_classification_by_cnn/requirements.txt
similarity index 100%
rename from examples/vm_sound_classification/requirements.txt
rename to examples/sound_classification_by_cnn/requirements.txt
diff --git a/examples/vm_sound_classification/run.sh b/examples/sound_classification_by_cnn/run.sh
similarity index 96%
rename from examples/vm_sound_classification/run.sh
rename to examples/sound_classification_by_cnn/run.sh
index 69239dd47caf5249b38bba11516b275ac956395c..313ab68ddc75e9b74b17408405fa8d79072ec949 100644
--- a/examples/vm_sound_classification/run.sh
+++ b/examples/sound_classification_by_cnn/run.sh
@@ -2,22 +2,22 @@
 
 : <<'END'
 
-sh run.sh --stage 0 --stop_stage 1 --system_version windows --file_folder_name file_dir --final_model_name sound-4-ch32 \
+sh run.sh --stage 0 --stop_stage 1 --system_version windows --file_folder_name file_dir --final_model_name sound-4-ch32-cnn \
 --filename_patterns "E:/Users/tianx/HuggingDatasets/cc_audio_8/data/wav_finished/wav_finished/en-US/wav_finished/*/*.wav \
 E:/Users/tianx/HuggingDatasets/cc_audio_8/data/wav_finished/id-ID/wav_finished/*/*.wav" \
 --label_plan 4
 
-sh run.sh --stage 2 --stop_stage 2 --system_version windows --file_folder_name file_dir --final_model_name sound-2-ch32 \
+sh run.sh --stage 2 --stop_stage 2 --system_version windows --file_folder_name file_dir --final_model_name sound-2-ch32-cnn \
 --filename_patterns "E:/Users/tianx/HuggingDatasets/cc_audio_8/data/wav_finished/wav_finished/en-US/wav_finished/*/*.wav \
 E:/Users/tianx/HuggingDatasets/cc_audio_8/data/wav_finished/id-ID/wav_finished/*/*.wav" \
 --label_plan 4
 
-sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-3-ch32 \
+sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-3-ch32-cnn \
 --filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/*/wav_finished/*/*.wav" \
 --label_plan 3 \
 --config_file "yaml/conv2d-classifier-3-ch4.yaml"
 
-sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-ms-my-2-ch32 \
+sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-ms-my-2-ch32-cnn \
 --filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/ms-MY/wav_finished/*/*.wav" \
 --label_plan 2-voicemail \
 --config_file "yaml/conv2d-classifier-2-ch32.yaml"
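test.py above, like step_2_to_1ch.py, step_3_split_two_second_wav.py, and the badcase filter, leans on the shared filename convention active_media_r_{call_id}_{language}_{scene_id}[_{offset}].wav and recovers the fields by position after splitting the stem on underscores. A tiny sketch with a made-up filename:

# Sketch of the active_media_r_* filename convention used across these scripts.
from pathlib import Path

filename = Path("active_media_r_6b76d306-b767-44e5-be9a-0a15d1165113_id-ID_none_16000.wav")
splits = filename.stem.split("_")
call_id = splits[3]   # "6b76d306-b767-44e5-be9a-0a15d1165113"
language = splits[4]  # "id-ID"
scene_id = splits[5]  # "none"

The UUID call id contains hyphens but never underscores, which is what makes the positional split reliable.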
diff --git a/examples/vm_sound_classification/run_batch.sh b/examples/sound_classification_by_cnn/run_batch.sh
similarity index 85%
rename from examples/vm_sound_classification/run_batch.sh
rename to examples/sound_classification_by_cnn/run_batch.sh
index f1544816df98b3f1b863cd9f4b1afbaee6465b0d..d47de7e577f04689c850142b9c91ae8e2d5f5458 100644
--- a/examples/vm_sound_classification/run_batch.sh
+++ b/examples/sound_classification_by_cnn/run_batch.sh
@@ -3,25 +3,25 @@
 
 # sound ch4
 
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-2-ch4 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-2-ch4-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/*/wav_finished/*/*.wav" \
 #--label_plan 2 \
 #--config_file "yaml/conv2d-classifier-2-ch4.yaml"
 #
 #
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-3-ch4 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-3-ch4-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/*/wav_finished/*/*.wav" \
 #--label_plan 3 \
 #--config_file "yaml/conv2d-classifier-3-ch4.yaml"
 #
 #
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-4-ch4 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-4-ch4-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/*/wav_finished/*/*.wav" \
 #--label_plan 4 \
 #--config_file "yaml/conv2d-classifier-4-ch4.yaml"
 #
 #
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-8-ch4 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-8-ch4-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/*/wav_finished/*/*.wav" \
 #--label_plan 8 \
 #--config_file "yaml/conv2d-classifier-8-ch4.yaml"
@@ -29,25 +29,25 @@
 
 # sound ch8
 
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-2-ch8 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-2-ch8-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/*/wav_finished/*/*.wav" \
 #--label_plan 2 \
 #--config_file "yaml/conv2d-classifier-2-ch8.yaml"
 #
 #
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-3-ch8 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-3-ch8-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/*/wav_finished/*/*.wav" \
 #--label_plan 3 \
 #--config_file "yaml/conv2d-classifier-3-ch8.yaml"
 #
 #
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-4-ch8 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-4-ch8-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/*/wav_finished/*/*.wav" \
 #--label_plan 4 \
 #--config_file "yaml/conv2d-classifier-4-ch8.yaml"
 #
 #
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-8-ch8 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-8-ch8-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/*/wav_finished/*/*.wav" \
 #--label_plan 8 \
 #--config_file "yaml/conv2d-classifier-8-ch8.yaml"
@@ -55,25 +55,25 @@
 
 # sound ch16
 
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-2-ch16 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-2-ch16-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/*/wav_finished/*/*.wav" \
 #--label_plan 2 \
 #--config_file "yaml/conv2d-classifier-2-ch16.yaml"
 
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-3-ch16 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-3-ch16-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/*/wav_finished/*/*.wav" \
 #--label_plan 3 \
 #--config_file "yaml/conv2d-classifier-3-ch16.yaml"
 #
 #
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-4-ch16 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-4-ch16-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/*/wav_finished/*/*.wav" \
 #--label_plan 4 \
 #--config_file "yaml/conv2d-classifier-4-ch16.yaml"
 #
 #
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-8-ch16 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-8-ch16-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/*/wav_finished/*/*.wav" \
 #--label_plan 8 \
 #--config_file "yaml/conv2d-classifier-8-ch16.yaml"
@@ -81,25 +81,25 @@
 
 # sound ch32
 
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-2-ch32 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-2-ch32-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/*/wav_finished/*/*.wav" \
 #--label_plan 2 \
 #--config_file "yaml/conv2d-classifier-2-ch32.yaml"
 #
 #
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-3-ch32 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-3-ch32-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/*/wav_finished/*/*.wav" \
 #--label_plan 3 \
 #--config_file "yaml/conv2d-classifier-3-ch32.yaml"
 #
 #
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-4-ch32 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-4-ch32-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/*/wav_finished/*/*.wav" \
 #--label_plan 4 \
 #--config_file "yaml/conv2d-classifier-4-ch32.yaml"
 
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-8-ch32 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-8-ch32-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/*/wav_finished/*/*.wav" \
 #--label_plan 8 \
 #--config_file "yaml/conv2d-classifier-8-ch32.yaml"
@@ -107,12 +107,12 @@
 
 # pretrained voicemail
 
-sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-2-ch4 \
+sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-2-ch4-cnn \
 --filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/*/wav_finished/*/*.wav" \
 --label_plan 2-voicemail \
 --config_file "yaml/conv2d-classifier-2-ch4.yaml"
 
-sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-2-ch32 \
+sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-2-ch32-cnn \
 --filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/*/wav_finished/*/*.wav" \
 --label_plan 2-voicemail \
 --config_file "yaml/conv2d-classifier-2-ch32.yaml"
@@ -120,149 +120,149 @@ sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name fi
 
 # voicemail ch4
 
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-en-ph-2-ch4 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-en-ph-2-ch4-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/en-PH/wav_finished/*/*.wav" \
 #--label_plan 2-voicemail \
 #--config_file "yaml/conv2d-classifier-2-ch4.yaml" \
-#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch4.zip"
+#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch4-cnn.zip"
 
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-en-sg-2-ch4 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-en-sg-2-ch4-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/en-SG/wav_finished/*/*.wav" \
 #--label_plan 2-voicemail \
 #--config_file "yaml/conv2d-classifier-2-ch4.yaml" \
-#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch4.zip"
+#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch4-cnn.zip"
 #
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-en-us-2-ch4 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-en-us-2-ch4-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/en-US/wav_finished/*/*.wav" \
 #--label_plan 2-voicemail \
 #--config_file "yaml/conv2d-classifier-2-ch4.yaml" \
-#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch4.zip"
+#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch4-cnn.zip"
 #
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-es-mx-2-ch4 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-es-mx-2-ch4-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/es-MX/wav_finished/*/*.wav" \
 #--label_plan 2-voicemail \
 #--config_file "yaml/conv2d-classifier-2-ch4.yaml" \
-#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch4.zip"
+#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch4-cnn.zip"
 #
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-es-pe-2-ch4 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-es-pe-2-ch4-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/es-PE/wav_finished/*/*.wav" \
 #--label_plan 2-voicemail \
 #--config_file "yaml/conv2d-classifier-2-ch4.yaml" \
-#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch4.zip"
+#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch4-cnn.zip"
 #
-sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-id-id-2-ch4 \
+sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-id-id-2-ch4-cnn \
 --filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/id-ID/wav_finished/*/*.wav" \
 --label_plan 2-voicemail \
 --config_file "yaml/conv2d-classifier-2-ch4.yaml" \
---pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch4.zip"
+--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch4-cnn.zip"
 
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-ja-jp-2-ch4 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-ja-jp-2-ch4-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/ja-JP/wav_finished/*/*.wav" \
 #--label_plan 2-voicemail \
 #--config_file "yaml/conv2d-classifier-2-ch4.yaml" \
-#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch4.zip"
+#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch4-cnn.zip"
 #
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-ko-kr-2-ch4 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-ko-kr-2-ch4-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/ko-KR/wav_finished/*/*.wav" \
 #--label_plan 2-voicemail \
 #--config_file "yaml/conv2d-classifier-2-ch4.yaml" \
-#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch4.zip"
+#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch4-cnn.zip"
 #
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-ms-my-2-ch4 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-ms-my-2-ch4-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/ms-MY/wav_finished/*/*.wav" \
 #--label_plan 2-voicemail \
 #--config_file "yaml/conv2d-classifier-2-ch4.yaml" \
-#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch4.zip"
+#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch4-cnn.zip"
 #
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-pt-br-2-ch4 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-pt-br-2-ch4-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/pt-BR/wav_finished/*/*.wav" \
 #--label_plan 2-voicemail \
 #--config_file "yaml/conv2d-classifier-2-ch4.yaml" \
-#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch4.zip"
+#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch4-cnn.zip"
 #
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-th-th-2-ch4 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-th-th-2-ch4-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/th-TH/wav_finished/*/*.wav" \
 #--label_plan 2-voicemail \
 #--config_file "yaml/conv2d-classifier-2-ch4.yaml" \
-#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch4.zip"
+#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch4-cnn.zip"
 #
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-zh-tw-2-ch4 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-zh-tw-2-ch4-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/zh-TW/wav_finished/*/*.wav" \
 #--label_plan 2-voicemail \
 #--config_file "yaml/conv2d-classifier-2-ch4.yaml" \
-#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch4.zip"
+#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch4-cnn.zip"
 
 
 # voicemail ch32
 
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-en-ph-2-ch32 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-en-ph-2-ch32-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/en-PH/wav_finished/*/*.wav" \
 #--label_plan 2-voicemail \
 #--config_file "yaml/conv2d-classifier-2-ch32.yaml" \
-#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch32.zip"
+#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch32-cnn.zip"
 
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-en-sg-2-ch32 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-en-sg-2-ch32-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/en-SG/wav_finished/*/*.wav" \
 #--label_plan 2-voicemail \
 #--config_file "yaml/conv2d-classifier-2-ch32.yaml" \
-#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch32.zip"
+#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch32-cnn.zip"
 #
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-en-us-2-ch32 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-en-us-2-ch32-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/en-US/wav_finished/*/*.wav" \
 #--label_plan 2-voicemail \
 #--config_file "yaml/conv2d-classifier-2-ch32.yaml" \
-#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch32.zip"
+#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch32-cnn.zip"
 #
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-es-mx-2-ch32 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-es-mx-2-ch32-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/es-MX/wav_finished/*/*.wav" \
 #--label_plan 2-voicemail \
 #--config_file "yaml/conv2d-classifier-2-ch32.yaml" \
-#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch32.zip"
+#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch32-cnn.zip"
 #
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-es-pe-2-ch32 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-es-pe-2-ch32-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/es-PE/wav_finished/*/*.wav" \
 #--label_plan 2-voicemail \
 #--config_file "yaml/conv2d-classifier-2-ch32.yaml" \
-#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch32.zip"
+#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch32-cnn.zip"
 #
-sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-id-id-2-ch32 \
+sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-id-id-2-ch32-cnn \
 --filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/id-ID/wav_finished/*/*.wav" \
 --label_plan 2-voicemail \
 --config_file "yaml/conv2d-classifier-2-ch32.yaml" \
---pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch32.zip"
+--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch32-cnn.zip"
 
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-ja-jp-2-ch32 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-ja-jp-2-ch32-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/ja-JP/wav_finished/*/*.wav" \
 #--label_plan 2-voicemail \
 #--config_file "yaml/conv2d-classifier-2-ch32.yaml" \
-#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch32.zip"
+#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch32-cnn.zip"
 #
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-ko-kr-2-ch32 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-ko-kr-2-ch32-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/ko-KR/wav_finished/*/*.wav" \
 #--label_plan 2-voicemail \
 #--config_file "yaml/conv2d-classifier-2-ch32.yaml" \
-#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch32.zip"
+#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch32-cnn.zip"
 #
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-ms-my-2-ch32 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-ms-my-2-ch32-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/ms-MY/wav_finished/*/*.wav" \
 #--label_plan 2-voicemail \
 #--config_file "yaml/conv2d-classifier-2-ch32.yaml" \
-#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch32.zip"
+#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch32-cnn.zip"
 #
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-pt-br-2-ch32 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-pt-br-2-ch32-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/pt-BR/wav_finished/*/*.wav" \
 #--label_plan 2-voicemail \
 #--config_file "yaml/conv2d-classifier-2-ch32.yaml" \
-#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch32.zip"
+#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch32-cnn.zip"
 #
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-th-th-2-ch32 \
+#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-th-th-2-ch32-cnn \
 #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/th-TH/wav_finished/*/*.wav" \
 #--label_plan 2-voicemail \
 #--config_file "yaml/conv2d-classifier-2-ch32.yaml" \
-#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch32.zip"
+#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch32-cnn.zip"
 #
-#sh run.sh --stage 0 --stop_stage 6 --system_version centos
--file_folder_name file_dir --final_model_name voicemail-zh-tw-2-ch32 \ +#sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-zh-tw-2-ch32-cnn \ #--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/zh-TW/wav_finished/*/*.wav" \ #--label_plan 2-voicemail \ #--config_file "yaml/conv2d-classifier-2-ch32.yaml" \ -#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch32.zip" +#--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch32-cnn.zip" diff --git a/examples/vm_sound_classification/step_1_prepare_data.py b/examples/sound_classification_by_cnn/step_1_prepare_data.py similarity index 100% rename from examples/vm_sound_classification/step_1_prepare_data.py rename to examples/sound_classification_by_cnn/step_1_prepare_data.py diff --git a/examples/vm_sound_classification/step_2_make_vocabulary.py b/examples/sound_classification_by_cnn/step_2_make_vocabulary.py similarity index 100% rename from examples/vm_sound_classification/step_2_make_vocabulary.py rename to examples/sound_classification_by_cnn/step_2_make_vocabulary.py diff --git a/examples/vm_sound_classification/step_3_train_model.py b/examples/sound_classification_by_cnn/step_3_train_model.py similarity index 99% rename from examples/vm_sound_classification/step_3_train_model.py rename to examples/sound_classification_by_cnn/step_3_train_model.py index e2154ade0f0d50e6e6737b517a96fc9ced8b27bb..f0df54fb5cc5fa34c8914920394f43639956c926 100644 --- a/examples/vm_sound_classification/step_3_train_model.py +++ b/examples/sound_classification_by_cnn/step_3_train_model.py @@ -50,7 +50,7 @@ def get_args(): parser.add_argument("--config_file", default="conv2d_classifier.yaml", type=str) parser.add_argument( "--pretrained_model", - # default=(project_path / "trained_models/voicemail-en-sg-2-ch4.zip").as_posix(), + # default=(project_path / "trained_models/voicemail-en-sg-2-ch4-cnn.zip").as_posix(), default="null", type=str ) diff --git a/examples/vm_sound_classification/step_4_evaluation_model.py b/examples/sound_classification_by_cnn/step_4_evaluation_model.py similarity index 100% rename from examples/vm_sound_classification/step_4_evaluation_model.py rename to examples/sound_classification_by_cnn/step_4_evaluation_model.py diff --git a/examples/vm_sound_classification/step_5_export_models.py b/examples/sound_classification_by_cnn/step_5_export_models.py similarity index 100% rename from examples/vm_sound_classification/step_5_export_models.py rename to examples/sound_classification_by_cnn/step_5_export_models.py diff --git a/examples/vm_sound_classification/step_6_infer.py b/examples/sound_classification_by_cnn/step_6_infer.py similarity index 100% rename from examples/vm_sound_classification/step_6_infer.py rename to examples/sound_classification_by_cnn/step_6_infer.py diff --git a/examples/vm_sound_classification/step_7_test_model.py b/examples/sound_classification_by_cnn/step_7_test_model.py similarity index 100% rename from examples/vm_sound_classification/step_7_test_model.py rename to examples/sound_classification_by_cnn/step_7_test_model.py diff --git a/examples/vm_sound_classification/stop.sh b/examples/sound_classification_by_cnn/stop.sh similarity index 100% rename from examples/vm_sound_classification/stop.sh rename to examples/sound_classification_by_cnn/stop.sh diff --git a/examples/vm_sound_classification/yaml/conv2d-classifier-2-ch16.yaml 
b/examples/sound_classification_by_cnn/yaml/conv2d-classifier-2-ch16.yaml similarity index 100% rename from examples/vm_sound_classification/yaml/conv2d-classifier-2-ch16.yaml rename to examples/sound_classification_by_cnn/yaml/conv2d-classifier-2-ch16.yaml diff --git a/examples/vm_sound_classification/yaml/conv2d-classifier-2-ch32.yaml b/examples/sound_classification_by_cnn/yaml/conv2d-classifier-2-ch32.yaml similarity index 100% rename from examples/vm_sound_classification/yaml/conv2d-classifier-2-ch32.yaml rename to examples/sound_classification_by_cnn/yaml/conv2d-classifier-2-ch32.yaml diff --git a/examples/vm_sound_classification/yaml/conv2d-classifier-2-ch4.yaml b/examples/sound_classification_by_cnn/yaml/conv2d-classifier-2-ch4.yaml similarity index 100% rename from examples/vm_sound_classification/yaml/conv2d-classifier-2-ch4.yaml rename to examples/sound_classification_by_cnn/yaml/conv2d-classifier-2-ch4.yaml diff --git a/examples/vm_sound_classification/yaml/conv2d-classifier-2-ch8.yaml b/examples/sound_classification_by_cnn/yaml/conv2d-classifier-2-ch8.yaml similarity index 100% rename from examples/vm_sound_classification/yaml/conv2d-classifier-2-ch8.yaml rename to examples/sound_classification_by_cnn/yaml/conv2d-classifier-2-ch8.yaml diff --git a/examples/vm_sound_classification/yaml/conv2d-classifier-3-ch16.yaml b/examples/sound_classification_by_cnn/yaml/conv2d-classifier-3-ch16.yaml similarity index 100% rename from examples/vm_sound_classification/yaml/conv2d-classifier-3-ch16.yaml rename to examples/sound_classification_by_cnn/yaml/conv2d-classifier-3-ch16.yaml diff --git a/examples/vm_sound_classification/yaml/conv2d-classifier-3-ch32.yaml b/examples/sound_classification_by_cnn/yaml/conv2d-classifier-3-ch32.yaml similarity index 100% rename from examples/vm_sound_classification/yaml/conv2d-classifier-3-ch32.yaml rename to examples/sound_classification_by_cnn/yaml/conv2d-classifier-3-ch32.yaml diff --git a/examples/vm_sound_classification/yaml/conv2d-classifier-3-ch4.yaml b/examples/sound_classification_by_cnn/yaml/conv2d-classifier-3-ch4.yaml similarity index 100% rename from examples/vm_sound_classification/yaml/conv2d-classifier-3-ch4.yaml rename to examples/sound_classification_by_cnn/yaml/conv2d-classifier-3-ch4.yaml diff --git a/examples/vm_sound_classification/yaml/conv2d-classifier-3-ch8.yaml b/examples/sound_classification_by_cnn/yaml/conv2d-classifier-3-ch8.yaml similarity index 100% rename from examples/vm_sound_classification/yaml/conv2d-classifier-3-ch8.yaml rename to examples/sound_classification_by_cnn/yaml/conv2d-classifier-3-ch8.yaml diff --git a/examples/vm_sound_classification/yaml/conv2d-classifier-4-ch16.yaml b/examples/sound_classification_by_cnn/yaml/conv2d-classifier-4-ch16.yaml similarity index 100% rename from examples/vm_sound_classification/yaml/conv2d-classifier-4-ch16.yaml rename to examples/sound_classification_by_cnn/yaml/conv2d-classifier-4-ch16.yaml diff --git a/examples/vm_sound_classification/yaml/conv2d-classifier-4-ch32.yaml b/examples/sound_classification_by_cnn/yaml/conv2d-classifier-4-ch32.yaml similarity index 100% rename from examples/vm_sound_classification/yaml/conv2d-classifier-4-ch32.yaml rename to examples/sound_classification_by_cnn/yaml/conv2d-classifier-4-ch32.yaml diff --git a/examples/vm_sound_classification/yaml/conv2d-classifier-4-ch4.yaml b/examples/sound_classification_by_cnn/yaml/conv2d-classifier-4-ch4.yaml similarity index 100% rename from examples/vm_sound_classification/yaml/conv2d-classifier-4-ch4.yaml rename 
to examples/sound_classification_by_cnn/yaml/conv2d-classifier-4-ch4.yaml diff --git a/examples/vm_sound_classification/yaml/conv2d-classifier-4-ch8.yaml b/examples/sound_classification_by_cnn/yaml/conv2d-classifier-4-ch8.yaml similarity index 100% rename from examples/vm_sound_classification/yaml/conv2d-classifier-4-ch8.yaml rename to examples/sound_classification_by_cnn/yaml/conv2d-classifier-4-ch8.yaml diff --git a/examples/vm_sound_classification/yaml/conv2d-classifier-8-ch16.yaml b/examples/sound_classification_by_cnn/yaml/conv2d-classifier-8-ch16.yaml similarity index 100% rename from examples/vm_sound_classification/yaml/conv2d-classifier-8-ch16.yaml rename to examples/sound_classification_by_cnn/yaml/conv2d-classifier-8-ch16.yaml diff --git a/examples/vm_sound_classification/yaml/conv2d-classifier-8-ch32.yaml b/examples/sound_classification_by_cnn/yaml/conv2d-classifier-8-ch32.yaml similarity index 100% rename from examples/vm_sound_classification/yaml/conv2d-classifier-8-ch32.yaml rename to examples/sound_classification_by_cnn/yaml/conv2d-classifier-8-ch32.yaml diff --git a/examples/vm_sound_classification/yaml/conv2d-classifier-8-ch4.yaml b/examples/sound_classification_by_cnn/yaml/conv2d-classifier-8-ch4.yaml similarity index 100% rename from examples/vm_sound_classification/yaml/conv2d-classifier-8-ch4.yaml rename to examples/sound_classification_by_cnn/yaml/conv2d-classifier-8-ch4.yaml diff --git a/examples/vm_sound_classification/yaml/conv2d-classifier-8-ch8.yaml b/examples/sound_classification_by_cnn/yaml/conv2d-classifier-8-ch8.yaml similarity index 100% rename from examples/vm_sound_classification/yaml/conv2d-classifier-8-ch8.yaml rename to examples/sound_classification_by_cnn/yaml/conv2d-classifier-8-ch8.yaml diff --git a/examples/vm_sound_classification8/requirements.txt b/examples/sound_classification_by_cnn_union/requirements.txt similarity index 100% rename from examples/vm_sound_classification8/requirements.txt rename to examples/sound_classification_by_cnn_union/requirements.txt diff --git a/examples/vm_sound_classification8/run.sh b/examples/sound_classification_by_cnn_union/run.sh similarity index 100% rename from examples/vm_sound_classification8/run.sh rename to examples/sound_classification_by_cnn_union/run.sh diff --git a/examples/vm_sound_classification8/step_1_prepare_data.py b/examples/sound_classification_by_cnn_union/step_1_prepare_data.py similarity index 100% rename from examples/vm_sound_classification8/step_1_prepare_data.py rename to examples/sound_classification_by_cnn_union/step_1_prepare_data.py diff --git a/examples/vm_sound_classification8/step_2_make_vocabulary.py b/examples/sound_classification_by_cnn_union/step_2_make_vocabulary.py similarity index 100% rename from examples/vm_sound_classification8/step_2_make_vocabulary.py rename to examples/sound_classification_by_cnn_union/step_2_make_vocabulary.py diff --git a/examples/vm_sound_classification8/step_3_train_global_model.py b/examples/sound_classification_by_cnn_union/step_3_train_global_model.py similarity index 100% rename from examples/vm_sound_classification8/step_3_train_global_model.py rename to examples/sound_classification_by_cnn_union/step_3_train_global_model.py diff --git a/examples/vm_sound_classification8/step_4_train_country_model.py b/examples/sound_classification_by_cnn_union/step_4_train_country_model.py similarity index 100% rename from examples/vm_sound_classification8/step_4_train_country_model.py rename to 
examples/sound_classification_by_cnn_union/step_4_train_country_model.py
diff --git a/examples/vm_sound_classification8/step_5_train_union.py b/examples/sound_classification_by_cnn_union/step_5_train_union.py
similarity index 100%
rename from examples/vm_sound_classification8/step_5_train_union.py
rename to examples/sound_classification_by_cnn_union/step_5_train_union.py
diff --git a/examples/vm_sound_classification8/stop.sh b/examples/sound_classification_by_cnn_union/stop.sh
similarity index 100%
rename from examples/vm_sound_classification8/stop.sh
rename to examples/sound_classification_by_cnn_union/stop.sh
diff --git a/examples/sound_classification_by_lstm/run.sh b/examples/sound_classification_by_lstm/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a7aa51fbf9702aa958915cf9321ee14ce621f01e
--- /dev/null
+++ b/examples/sound_classification_by_lstm/run.sh
@@ -0,0 +1,197 @@
+#!/usr/bin/env bash
+
+: <<'END'
+
+sh run.sh --stage 0 --stop_stage 1 --system_version windows --file_folder_name file_dir --final_model_name sound-4-ch32-lstm \
+--filename_patterns "E:/Users/tianx/HuggingDatasets/cc_audio_8/data/wav_finished/wav_finished/en-US/wav_finished/*/*.wav \
+E:/Users/tianx/HuggingDatasets/cc_audio_8/data/wav_finished/id-ID/wav_finished/*/*.wav" \
+--label_plan 4
+
+sh run.sh --stage 2 --stop_stage 2 --system_version windows --file_folder_name file_dir --final_model_name sound-2-ch32-lstm \
+--filename_patterns "E:/Users/tianx/HuggingDatasets/cc_audio_8/data/wav_finished/wav_finished/en-US/wav_finished/*/*.wav \
+E:/Users/tianx/HuggingDatasets/cc_audio_8/data/wav_finished/id-ID/wav_finished/*/*.wav" \
+--label_plan 4
+
+sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name sound-3-ch32-lstm \
+--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/*/wav_finished/*/*.wav" \
+--label_plan 3 \
+--config_file "yaml/lstm_classifier-3-ch64.yaml"
+
+sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-ms-my-2-ch32-lstm \
+--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/ms-MY/wav_finished/*/*.wav" \
+--label_plan 2-voicemail \
+--config_file "yaml/lstm_classifier-2-ch64.yaml"
+
+END
+
+
+# params
+system_version="windows";
+verbose=true;
+stage=0 # start from 0 if you need to start from data preparation
+stop_stage=9
+
+work_dir="$(pwd)"
+file_folder_name=file_folder_name
+final_model_name=final_model_name
+filename_patterns="/data/tianxing/PycharmProjects/datasets/voicemail/*/wav_finished/*/*.wav"
+label_plan=4
+config_file="yaml/lstm_classifier-4-ch64.yaml"
+pretrained_model=null
+nohup_name=nohup.out
+
+country=en-US
+
+# model params
+batch_size=64
+max_epochs=200
+save_top_k=10
+patience=5
+
+
+# parse options
+while true; do
+  [ -z "${1:-}" ] && break;  # break if there are no arguments
+  case "$1" in
+    --*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g);
+      eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
+      old_value="$(eval echo \$$name)";
+      if [ "${old_value}" == "true" ] || [ "${old_value}" == "false" ]; then
+        was_bool=true;
+      else
+        was_bool=false;
+      fi
+
+      # Set the variable to the right value-- the escaped quotes make it work if
+      # the option had spaces, like --cmd "queue.pl -sync y"
+      eval "${name}=\"$2\"";
+
+      # Check that Boolean-valued arguments are really Boolean.
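+      # e.g. a hypothetical "sh run.sh --batch_size 32" reaches this point with
+      # name=batch_size and $2=32, so the eval above has assigned batch_size="32";
+      # only options whose defaults were true/false are type-checked below.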
+ if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then + echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 + exit 1; + fi + shift 2; + ;; + + *) break; + esac +done + +file_dir="${work_dir}/${file_folder_name}" +final_model_dir="${work_dir}/../../trained_models/${final_model_name}"; + +dataset="${file_dir}/dataset.xlsx" +train_dataset="${file_dir}/train.xlsx" +valid_dataset="${file_dir}/valid.xlsx" +evaluation_file="${file_dir}/evaluation.xlsx" +vocabulary_dir="${file_dir}/vocabulary" + +$verbose && echo "system_version: ${system_version}" +$verbose && echo "file_folder_name: ${file_folder_name}" + +if [ $system_version == "windows" ]; then + alias python3='D:/Users/tianx/PycharmProjects/virtualenv/cc_audio_8/Scripts/python.exe' +elif [ $system_version == "centos" ] || [ $system_version == "ubuntu" ]; then + #source /data/local/bin/cc_audio_8/bin/activate + alias python3='/data/local/bin/cc_audio_8/bin/python3' +fi + + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + $verbose && echo "stage 0: prepare data" + cd "${work_dir}" || exit 1 + python3 step_1_prepare_data.py \ + --file_dir "${file_dir}" \ + --filename_patterns "${filename_patterns}" \ + --train_dataset "${train_dataset}" \ + --valid_dataset "${valid_dataset}" \ + --label_plan "${label_plan}" \ + +fi + + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + $verbose && echo "stage 1: make vocabulary" + cd "${work_dir}" || exit 1 + python3 step_2_make_vocabulary.py \ + --vocabulary_dir "${vocabulary_dir}" \ + --train_dataset "${train_dataset}" \ + --valid_dataset "${valid_dataset}" \ + +fi + + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + $verbose && echo "stage 2: train model" + cd "${work_dir}" || exit 1 + python3 step_3_train_model.py \ + --vocabulary_dir "${vocabulary_dir}" \ + --train_dataset "${train_dataset}" \ + --valid_dataset "${valid_dataset}" \ + --serialization_dir "${file_dir}" \ + --config_file "${config_file}" \ + --pretrained_model "${pretrained_model}" \ + +fi + + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + $verbose && echo "stage 3: test model" + cd "${work_dir}" || exit 1 + python3 step_4_evaluation_model.py \ + --dataset "${dataset}" \ + --vocabulary_dir "${vocabulary_dir}" \ + --model_dir "${file_dir}/best" \ + --output_file "${evaluation_file}" \ + +fi + + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + $verbose && echo "stage 4: export model" + cd "${work_dir}" || exit 1 + python3 step_5_export_models.py \ + --vocabulary_dir "${vocabulary_dir}" \ + --model_dir "${file_dir}/best" \ + --serialization_dir "${file_dir}" \ + +fi + + +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + $verbose && echo "stage 5: collect files" + cd "${work_dir}" || exit 1 + + mkdir -p ${final_model_dir} + + cp "${file_dir}/best"/* "${final_model_dir}" + cp -r "${file_dir}/vocabulary" "${final_model_dir}" + + cp "${file_dir}/evaluation.xlsx" "${final_model_dir}/evaluation.xlsx" + + cp "${file_dir}/trace_model.zip" "${final_model_dir}/trace_model.zip" + cp "${file_dir}/trace_quant_model.zip" "${final_model_dir}/trace_quant_model.zip" + cp "${file_dir}/script_model.zip" "${final_model_dir}/script_model.zip" + cp "${file_dir}/script_quant_model.zip" "${final_model_dir}/script_quant_model.zip" + + cd "${final_model_dir}/.." 
|| exit 1; + + if [ -e "${final_model_name}.zip" ]; then + rm -rf "${final_model_name}_backup.zip" + mv "${final_model_name}.zip" "${final_model_name}_backup.zip" + fi + + zip -r "${final_model_name}.zip" "${final_model_name}" + rm -rf "${final_model_name}" + +fi + + +if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then + $verbose && echo "stage 6: clear file_dir" + cd "${work_dir}" || exit 1 + + rm -rf "${file_dir}"; + +fi diff --git a/examples/sound_classification_by_lstm/step_1_prepare_data.py b/examples/sound_classification_by_lstm/step_1_prepare_data.py new file mode 100644 index 0000000000000000000000000000000000000000..6ec981d2f96d45e1267154f3045916e0b42bcfdc --- /dev/null +++ b/examples/sound_classification_by_lstm/step_1_prepare_data.py @@ -0,0 +1,193 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +import argparse +from glob import glob +import os +from pathlib import Path +import random +import sys + +pwd = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.join(pwd, "../../")) + +import pandas as pd +from scipy.io import wavfile +from tqdm import tqdm + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--file_dir", default="./", type=str) + parser.add_argument("--filename_patterns", type=str) + + parser.add_argument("--train_dataset", default="train.xlsx", type=str) + parser.add_argument("--valid_dataset", default="valid.xlsx", type=str) + + parser.add_argument("--label_plan", default="4", type=str) + + args = parser.parse_args() + return args + + +def get_dataset(args): + filename_patterns = args.filename_patterns + filename_patterns = filename_patterns.split(" ") + print(filename_patterns) + + file_dir = Path(args.file_dir) + file_dir.mkdir(exist_ok=True) + + if args.label_plan == "2-voicemail": + label_map = { + "bell": "voicemail", + "white_noise": "non_voicemail", + "low_white_noise": "non_voicemail", + "high_white_noise": "non_voicemail", + # "music": "non_voicemail", + "mute": "non_voicemail", + "noise": "non_voicemail", + "noise_mute": "non_voicemail", + "voice": "non_voicemail", + "voicemail": "voicemail", + } + elif args.label_plan == "2": + label_map = { + "bell": "non_voice", + "white_noise": "non_voice", + "low_white_noise": "non_voice", + "high_white_noise": "non_voice", + "music": "non_voice", + "mute": "non_voice", + "noise": "non_voice", + "noise_mute": "non_voice", + "voice": "voice", + "voicemail": "voice", + } + elif args.label_plan == "3": + label_map = { + "bell": "voicemail", + "white_noise": "mute", + "low_white_noise": "mute", + "high_white_noise": "mute", + # "music": "music", + "mute": "mute", + "noise": "voice_or_noise", + "noise_mute": "voice_or_noise", + "voice": "voice_or_noise", + "voicemail": "voicemail", + } + elif args.label_plan == "4": + label_map = { + "bell": "voicemail", + "white_noise": "mute", + "low_white_noise": "mute", + "high_white_noise": "mute", + # "music": "music", + "mute": "mute", + "noise": "noise", + "noise_mute": "noise", + "voice": "voice", + "voicemail": "voicemail", + } + elif args.label_plan == "8": + label_map = { + "bell": "bell", + "white_noise": "white_noise", + "low_white_noise": "white_noise", + "high_white_noise": "white_noise", + "music": "music", + "mute": "mute", + "noise": "noise", + "noise_mute": "noise_mute", + "voice": "voice", + "voicemail": "voicemail", + } + else: + raise AssertionError + + result = list() + for filename_pattern in filename_patterns: + filename_list = glob(filename_pattern) + for filename in tqdm(filename_list): + filename = Path(filename) 
+            sample_rate, signal = wavfile.read(filename.as_posix())
+            if len(signal) < sample_rate * 2:
+                continue
+
+            folder = filename.parts[-2]
+            country = filename.parts[-4]
+
+            if folder not in label_map.keys():
+                continue
+
+            labels = label_map[folder]
+
+            random1 = random.random()
+            random2 = random.random()
+
+            result.append({
+                "filename": filename,
+                "folder": folder,
+                "category": country,
+                "labels": labels,
+                "random1": random1,
+                "random2": random2,
+                "flag": "TRAIN" if random2 < 0.8 else "TEST",
+            })
+
+    df = pd.DataFrame(result)
+    pivot_table = pd.pivot_table(df, index=["labels"], values=["filename"], aggfunc="count")
+    print(pivot_table)
+
+    df = df.sort_values(by=["random1"], ascending=False)
+    df.to_excel(
+        file_dir / "dataset.xlsx",
+        index=False,
+        # encoding="utf_8_sig"
+    )
+
+    return
+
+
+def split_dataset(args):
+    """Split the dataset into train and test sets."""
+    file_dir = Path(args.file_dir)
+    file_dir.mkdir(exist_ok=True)
+
+    df = pd.read_excel(file_dir / "dataset.xlsx")
+
+    train = list()
+    test = list()
+
+    for i, row in df.iterrows():
+        flag = row["flag"]
+        if flag == "TRAIN":
+            train.append(row)
+        else:
+            test.append(row)
+
+    train = pd.DataFrame(train)
+    train.to_excel(
+        args.train_dataset,
+        index=False,
+        # encoding="utf_8_sig"
+    )
+    test = pd.DataFrame(test)
+    test.to_excel(
+        args.valid_dataset,
+        index=False,
+        # encoding="utf_8_sig"
+    )
+
+    return
+
+
+def main():
+    args = get_args()
+    get_dataset(args)
+    split_dataset(args)
+    return
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/sound_classification_by_lstm/step_2_make_vocabulary.py b/examples/sound_classification_by_lstm/step_2_make_vocabulary.py
new file mode 100644
index 0000000000000000000000000000000000000000..db355f47e75b4c931e9a163826089bf4a5a7d26b
--- /dev/null
+++ b/examples/sound_classification_by_lstm/step_2_make_vocabulary.py
@@ -0,0 +1,50 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import argparse
+import os
+import sys
+
+pwd = os.path.abspath(os.path.dirname(__file__))
+sys.path.append(os.path.join(pwd, "../../"))
+
+import pandas as pd
+
+from toolbox.torch.utils.data.vocabulary import Vocabulary
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--vocabulary_dir", default="vocabulary", type=str)
+
+    parser.add_argument("--train_dataset", default="train.xlsx", type=str)
+    parser.add_argument("--valid_dataset", default="valid.xlsx", type=str)
+
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = get_args()
+
+    train_dataset = pd.read_excel(args.train_dataset)
+    valid_dataset = pd.read_excel(args.valid_dataset)
+
+    vocabulary = Vocabulary()
+
+    # train
+    for i, row in train_dataset.iterrows():
+        label = row["labels"]
+        vocabulary.add_token_to_namespace(label, namespace="labels")
+
+    # valid
+    for i, row in valid_dataset.iterrows():
+        label = row["labels"]
+        vocabulary.add_token_to_namespace(label, namespace="labels")
+
+    vocabulary.save_to_files(args.vocabulary_dir)
+
+    return
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/sound_classification_by_lstm/step_3_train_model.py b/examples/sound_classification_by_lstm/step_3_train_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..349542cccc7ec6a3ebe3c9ea2d1beaf0b662e72e
--- /dev/null
+++ b/examples/sound_classification_by_lstm/step_3_train_model.py
@@ -0,0 +1,367 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import argparse
+from collections import defaultdict
+import json
+import logging
+from logging.handlers import TimedRotatingFileHandler
+import os
+import
platform +from pathlib import Path +import random +import sys +import shutil +import tempfile +from typing import List +import zipfile + +pwd = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.join(pwd, "../../")) + +import numpy as np +import torch +from torch.utils.data.dataloader import DataLoader +from tqdm import tqdm + +from toolbox.torch.modules.loss import FocalLoss, HingeLoss, HingeLinear +from toolbox.torch.training.metrics.categorical_accuracy import CategoricalAccuracy +from toolbox.torch.utils.data.vocabulary import Vocabulary +from toolbox.torch.utils.data.dataset.wave_classifier_excel_dataset import WaveClassifierExcelDataset +from toolbox.torchaudio.models.lstm_audio_classifier.modeling_lstm_audio_classifier import LSTMClassifierPretrainedModel +from toolbox.torchaudio.models.lstm_audio_classifier.configuration_lstm_audio_classifier import LSTMClassifierConfig + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--vocabulary_dir", default="vocabulary", type=str) + + parser.add_argument("--train_dataset", default="train.xlsx", type=str) + parser.add_argument("--valid_dataset", default="valid.xlsx", type=str) + + parser.add_argument("--max_epochs", default=100, type=int) + + parser.add_argument("--batch_size", default=64, type=int) + parser.add_argument("--learning_rate", default=1e-3, type=float) + parser.add_argument("--num_serialized_models_to_keep", default=10, type=int) + parser.add_argument("--patience", default=5, type=int) + parser.add_argument("--serialization_dir", default="serialization_dir", type=str) + parser.add_argument("--seed", default=0, type=int) + + parser.add_argument("--config_file", default="conv2d_classifier.yaml", type=str) + parser.add_argument( + "--pretrained_model", + # default=(project_path / "trained_models/voicemail-en-sg-2-ch4-cnn.zip").as_posix(), + default="null", + type=str + ) + + args = parser.parse_args() + return args + + +def logging_config(file_dir: str): + fmt = "%(asctime)s - %(name)s - %(levelname)s %(filename)s:%(lineno)d > %(message)s" + + logging.basicConfig(format=fmt, + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.DEBUG) + file_handler = TimedRotatingFileHandler( + filename=os.path.join(file_dir, "main.log"), + encoding="utf-8", + when="D", + interval=1, + backupCount=7 + ) + file_handler.setLevel(logging.INFO) + file_handler.setFormatter(logging.Formatter(fmt)) + logger = logging.getLogger(__name__) + logger.addHandler(file_handler) + + return logger + + +class CollateFunction(object): + def __init__(self): + pass + + def __call__(self, batch: List[dict]): + array_list = list() + label_list = list() + for sample in batch: + array = sample["waveform"] + label = sample["label"] + + l = len(array) + if l < 16000: + delta = int(16000 - l) + array = np.concatenate([array, np.zeros(shape=(delta,), dtype=np.float32)], axis=-1) + if l > 16000: + array = array[:16000] + + array_list.append(array) + label_list.append(label) + + array_list = torch.stack(array_list) + label_list = torch.stack(label_list) + return array_list, label_list + + +collate_fn = CollateFunction() + + +def main(): + args = get_args() + + serialization_dir = Path(args.serialization_dir) + serialization_dir.mkdir(parents=True, exist_ok=True) + + logger = logging_config(serialization_dir) + + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + logger.info("set seed: {}".format(args.seed)) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + n_gpu = 
torch.cuda.device_count()
+    logger.info("GPU available count: {}; device: {}".format(n_gpu, device))
+
+    vocabulary = Vocabulary.from_files(args.vocabulary_dir)
+
+    # datasets
+    logger.info("prepare datasets")
+    train_dataset = WaveClassifierExcelDataset(
+        vocab=vocabulary,
+        excel_file=args.train_dataset,
+        category=None,
+        category_field="category",
+        label_field="labels",
+        expected_sample_rate=8000,
+        max_wave_value=32768.0,
+    )
+    valid_dataset = WaveClassifierExcelDataset(
+        vocab=vocabulary,
+        excel_file=args.valid_dataset,
+        category=None,
+        category_field="category",
+        label_field="labels",
+        expected_sample_rate=8000,
+        max_wave_value=32768.0,
+    )
+    train_data_loader = DataLoader(
+        dataset=train_dataset,
+        batch_size=args.batch_size,
+        shuffle=True,
+        # On Linux, data can be loaded with multiple worker processes; this is not possible on Windows.
+        num_workers=0 if platform.system() == "Windows" else os.cpu_count() // 2,
+        collate_fn=collate_fn,
+        pin_memory=False,
+        # prefetch_factor=64,
+    )
+    valid_data_loader = DataLoader(
+        dataset=valid_dataset,
+        batch_size=args.batch_size,
+        shuffle=True,
+        # On Linux, data can be loaded with multiple worker processes; this is not possible on Windows.
+        num_workers=0 if platform.system() == "Windows" else os.cpu_count() // 2,
+        collate_fn=collate_fn,
+        pin_memory=False,
+        # prefetch_factor=64,
+    )
+
+    # models
+    logger.info(f"prepare models. config_file: {args.config_file}")
+    config = LSTMClassifierConfig.from_pretrained(
+        pretrained_model_name_or_path=args.config_file,
+        # num_labels=vocabulary.get_vocab_size(namespace="labels")
+    )
+    if not config.cls_head_param["num_labels"] == vocabulary.get_vocab_size(namespace="labels"):
+        raise AssertionError("expected num labels: {} instead of {}.".format(
+            vocabulary.get_vocab_size(namespace="labels"),
+            config.cls_head_param["num_labels"],
+        ))
+    model = LSTMClassifierPretrainedModel(
+        config=config,
+    )
+
+    if args.pretrained_model is not None and os.path.exists(args.pretrained_model):
+        logger.info(f"load pretrained model state dict from: {args.pretrained_model}")
+        pretrained_model = Path(args.pretrained_model)
+        with zipfile.ZipFile(pretrained_model.as_posix(), "r") as f_zip:
+            out_root = Path(tempfile.gettempdir()) / "cc_audio_8"
+            # print(out_root.as_posix())
+            if out_root.exists():
+                shutil.rmtree(out_root.as_posix())
+            out_root.mkdir(parents=True, exist_ok=True)
+            f_zip.extractall(path=out_root)
+
+        tgt_path = out_root / pretrained_model.stem
+        model_pt_file = tgt_path / "model.pt"
+        with open(model_pt_file, "rb") as f:
+            state_dict = torch.load(f, map_location="cpu")
+        model.load_state_dict(state_dict=state_dict)
+
+    model.to(device)
+    model.train()
+
+    # optimizer
+    logger.info("prepare optimizer, lr_scheduler, loss_fn, categorical_accuracy")
+    param_optimizer = model.parameters()
+    optimizer = torch.optim.Adam(
+        param_optimizer,
+        lr=args.learning_rate,
+    )
+    # lr_scheduler = torch.optim.lr_scheduler.StepLR(
+    #     optimizer,
+    #     step_size=2000
+    # )
+    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
+        optimizer,
+        milestones=[10000, 20000, 30000, 40000, 50000], gamma=0.5
+    )
+    focal_loss = FocalLoss(
+        num_classes=vocabulary.get_vocab_size(namespace="labels"),
+        reduction="mean",
+    )
+    categorical_accuracy = CategoricalAccuracy()
+
+    # training loop
+    logger.info("training")
+
+    training_loss = 10000000000
+    training_accuracy = 0.
+    evaluation_loss = 10000000000
+    evaluation_accuracy = 0.
+
+    model_list = list()
+    best_idx_epoch = None
+    best_accuracy = None
+    patience_count = 0
+
+    for idx_epoch in range(args.max_epochs):
+        categorical_accuracy.reset()
+        total_loss = 0.
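+        # per-epoch accumulators: total_loss / total_examples is the mean loss shown in the progress bar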
+        total_examples = 0.
+        progress_bar = tqdm(
+            total=len(train_data_loader),
+            desc="Training; epoch: {}".format(idx_epoch),
+        )
+        for batch in train_data_loader:
+            input_ids, label_ids = batch
+            input_ids = input_ids.to(device)
+            label_ids: torch.LongTensor = label_ids.to(device).long()
+
+            # the classifier returns (logits, h, c); only the logits are needed here
+            logits, _, _ = model.forward(input_ids)
+            loss = focal_loss.forward(logits, label_ids.view(-1))
+            categorical_accuracy(logits, label_ids)
+
+            total_loss += loss.item()
+            total_examples += input_ids.size(0)
+
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+            lr_scheduler.step()
+
+            training_loss = total_loss / total_examples
+            training_loss = round(training_loss, 4)
+            training_accuracy = categorical_accuracy.get_metric()["accuracy"]
+            training_accuracy = round(training_accuracy, 4)
+
+            progress_bar.update(1)
+            progress_bar.set_postfix({
+                "training_loss": training_loss,
+                "training_accuracy": training_accuracy,
+            })
+
+        categorical_accuracy.reset()
+        total_loss = 0.
+        total_examples = 0.
+        progress_bar = tqdm(
+            total=len(valid_data_loader),
+            desc="Evaluation; epoch: {}".format(idx_epoch),
+        )
+        for batch in valid_data_loader:
+            input_ids, label_ids = batch
+            input_ids = input_ids.to(device)
+            label_ids: torch.LongTensor = label_ids.to(device).long()
+
+            with torch.no_grad():
+                logits, _, _ = model.forward(input_ids)
+                loss = focal_loss.forward(logits, label_ids.view(-1))
+                categorical_accuracy(logits, label_ids)
+
+            total_loss += loss.item()
+            total_examples += input_ids.size(0)
+
+            evaluation_loss = total_loss / total_examples
+            evaluation_loss = round(evaluation_loss, 4)
+            evaluation_accuracy = categorical_accuracy.get_metric()["accuracy"]
+            evaluation_accuracy = round(evaluation_accuracy, 4)
+
+            progress_bar.update(1)
+            progress_bar.set_postfix({
+                "evaluation_loss": evaluation_loss,
+                "evaluation_accuracy": evaluation_accuracy,
+            })
+
+        # save path
+        epoch_dir = serialization_dir / "epoch-{}".format(idx_epoch)
+        epoch_dir.mkdir(parents=True, exist_ok=False)
+
+        # save models
+        model.save_pretrained(epoch_dir.as_posix())
+
+        model_list.append(epoch_dir)
+        if len(model_list) > args.num_serialized_models_to_keep:
+            model_to_delete: Path = model_list.pop(0)
+            shutil.rmtree(model_to_delete.as_posix())
+
+        # save metric
+        if best_accuracy is None:
+            best_idx_epoch = idx_epoch
+            best_accuracy = evaluation_accuracy
+        elif evaluation_accuracy > best_accuracy:
+            best_idx_epoch = idx_epoch
+            best_accuracy = evaluation_accuracy
+        else:
+            pass
+
+        metrics = {
+            "idx_epoch": idx_epoch,
+            "best_idx_epoch": best_idx_epoch,
+            "best_accuracy": best_accuracy,
+            "training_loss": training_loss,
+            "training_accuracy": training_accuracy,
+            "evaluation_loss": evaluation_loss,
+            "evaluation_accuracy": evaluation_accuracy,
+            "learning_rate": optimizer.param_groups[0]['lr'],
+        }
+        metrics_filename = epoch_dir / "metrics_epoch.json"
+        with open(metrics_filename, "w", encoding="utf-8") as f:
+            json.dump(metrics, f, indent=4, ensure_ascii=False)
+
+        # save best
+        best_dir = serialization_dir / "best"
+        if best_idx_epoch == idx_epoch:
+            if best_dir.exists():
+                shutil.rmtree(best_dir)
+            shutil.copytree(epoch_dir, best_dir)
+
+        # early stop
+        early_stop_flag = False
+        if best_idx_epoch == idx_epoch:
+            patience_count = 0
+        else:
+            patience_count += 1
+            if patience_count >= args.patience:
+                early_stop_flag = True
+
+        # early stop
+        if early_stop_flag:
+            break
+    return
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/sound_classification_by_lstm/yaml/lstm_classifier-4-ch64.yaml
b/examples/sound_classification_by_lstm/yaml/lstm_classifier-4-ch64.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2bcc2b316813c2e31fac095a04b5f4d7e048feeb --- /dev/null +++ b/examples/sound_classification_by_lstm/yaml/lstm_classifier-4-ch64.yaml @@ -0,0 +1,27 @@ +model_name: "lstm_audio_classifier" + +mel_spectrogram_param: + sample_rate: 8000 + n_fft: 512 + win_length: 200 + hop_length: 80 + f_min: 10 + f_max: 3800 + window_fn: hamming + n_mels: 80 + +lstm_layer_param: + input_size: 80 + hidden_size: 64 + num_layers: 3 + dropout: 0.2 + pool_layer: last + +cls_head_param: + input_dim: 64 + num_layers: 1 + hidden_dims: + - 32 + activations: relu + dropout: 0.1 + num_labels: 4 diff --git a/requirements.txt b/requirements.txt index 75bdc92d636e88327da7e3ee0fbdfb7c152aaf2f..eb5016f89dd36de539cef525084797f35ed57c13 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,3 +13,4 @@ gradio python-dotenv numpy onnxruntime +scipy diff --git a/toolbox/torchaudio/models/lstm_audio_classifier/__init__.py b/toolbox/torchaudio/models/lstm_audio_classifier/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..81a66fc40cec5e1bad20c94ebc03002f9772eb07 --- /dev/null +++ b/toolbox/torchaudio/models/lstm_audio_classifier/__init__.py @@ -0,0 +1,6 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + + +if __name__ == "__main__": + pass diff --git a/toolbox/torchaudio/models/lstm_audio_classifier/configuration_lstm_audio_classifier.py b/toolbox/torchaudio/models/lstm_audio_classifier/configuration_lstm_audio_classifier.py new file mode 100644 index 0000000000000000000000000000000000000000..88fdf652e5018b86c776e36abc0c0d66bdc9f40d --- /dev/null +++ b/toolbox/torchaudio/models/lstm_audio_classifier/configuration_lstm_audio_classifier.py @@ -0,0 +1,22 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +from typing import Any, Dict, List, Tuple, Union + +from toolbox.torchaudio.configuration_utils import PretrainedConfig + + +class LSTMClassifierConfig(PretrainedConfig): + def __init__(self, + mel_spectrogram_param: dict, + lstm_layer_param: dict, + cls_head_param: dict, + **kwargs + ): + super(LSTMClassifierConfig, self).__init__(**kwargs) + self.mel_spectrogram_param = mel_spectrogram_param + self.lstm_layer_param = lstm_layer_param + self.cls_head_param = cls_head_param + + +if __name__ == "__main__": + pass diff --git a/toolbox/torchaudio/models/lstm_audio_classifier/examples/lstm_classifier.yaml b/toolbox/torchaudio/models/lstm_audio_classifier/examples/lstm_classifier.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2bcc2b316813c2e31fac095a04b5f4d7e048feeb --- /dev/null +++ b/toolbox/torchaudio/models/lstm_audio_classifier/examples/lstm_classifier.yaml @@ -0,0 +1,27 @@ +model_name: "lstm_audio_classifier" + +mel_spectrogram_param: + sample_rate: 8000 + n_fft: 512 + win_length: 200 + hop_length: 80 + f_min: 10 + f_max: 3800 + window_fn: hamming + n_mels: 80 + +lstm_layer_param: + input_size: 80 + hidden_size: 64 + num_layers: 3 + dropout: 0.2 + pool_layer: last + +cls_head_param: + input_dim: 64 + num_layers: 1 + hidden_dims: + - 32 + activations: relu + dropout: 0.1 + num_labels: 4 diff --git a/toolbox/torchaudio/models/lstm_audio_classifier/modeling_lstm_audio_classifier.py b/toolbox/torchaudio/models/lstm_audio_classifier/modeling_lstm_audio_classifier.py new file mode 100644 index 0000000000000000000000000000000000000000..e8884a4f5e0cba5e18c33a96e4f7d08afafa2aad --- /dev/null +++ 
b/toolbox/torchaudio/models/lstm_audio_classifier/modeling_lstm_audio_classifier.py @@ -0,0 +1,295 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +import os +from typing import Dict, List, Optional, Tuple, Union + +import torch +import torchaudio +import torch.nn as nn +from toolbox.torchaudio.configuration_utils import CONFIG_FILE, PretrainedConfig +from toolbox.torchaudio.models.lstm_audio_classifier.configuration_lstm_audio_classifier import LSTMClassifierConfig + + +MODEL_FILE = "model.pt" + + +name2activation = { + "relu": nn.ReLU, +} + + +class FeedForward(nn.Module): + def __init__(self, + input_dim: int, + num_layers: int, + hidden_dims: Union[int, List[int]], + activations: Union[str, List[str]], + dropout: Union[float, List[float]] = 0.0) -> None: + + super(FeedForward, self).__init__() + if not isinstance(hidden_dims, list): + hidden_dims = [hidden_dims] * num_layers # type: ignore + if not isinstance(activations, list): + activations = [activations] * num_layers # type: ignore + if not isinstance(dropout, list): + dropout = [dropout] * num_layers # type: ignore + if len(hidden_dims) != num_layers: + raise AssertionError("len(hidden_dims) (%d) != num_layers (%d)" % + (len(hidden_dims), num_layers)) + if len(activations) != num_layers: + raise AssertionError("len(activations) (%d) != num_layers (%d)" % + (len(activations), num_layers)) + if len(dropout) != num_layers: + raise AssertionError("len(dropout) (%d) != num_layers (%d)" % + (len(dropout), num_layers)) + self._activations = torch.nn.ModuleList([name2activation[activation]() for activation in activations]) + + input_dims = [input_dim] + hidden_dims[:-1] + linear_layers = [] + for layer_input_dim, layer_output_dim in zip(input_dims, hidden_dims): + linear_layers.append(torch.nn.Linear(layer_input_dim, layer_output_dim)) + self._linear_layers = torch.nn.ModuleList(linear_layers) + dropout_layers = [torch.nn.Dropout(p=value) for value in dropout] + self._dropout = torch.nn.ModuleList(dropout_layers) + self.output_dim = hidden_dims[-1] + self.input_dim = input_dim + + def get_output_dim(self): + return self.output_dim + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + output = inputs + for layer, activation, dropout in zip(self._linear_layers, self._activations, self._dropout): + output = dropout(activation(layer(output))) + return output + + +class LSTMLayer(nn.Module): + def __init__(self, + input_size: int, + hidden_size: int, + num_layers: int, + dropout: float = 0.0, + pool_layer: str = "last" + ): + super(LSTMLayer, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.dropout = dropout if num_layers > 1 else 0.0 + # mean, last + self.pool_layer = pool_layer + + self.lstm = nn.LSTM( + input_size=self.input_size, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + dropout=self.dropout, + batch_first=True + ) + + def forward(self, inputs: torch.Tensor, h: Optional[torch.Tensor] = None, c: Optional[torch.Tensor] = None): + """ + :param inputs: shape, [b, t, f] + :param h: shape, [num_layers, b, hidden_size] + :param c: shape, [num_layers, b, hidden_size] + :return: + features: shape, [b, hidden_size] + h: shape, [num_layers, b, hidden_size] + c: shape, [num_layers, b, hidden_size] + """ + if h is None or c is None: + batch_size = inputs.size(0) + h, c = self._init_hidden(batch_size, inputs.device) + if inputs.dim() == 4: + # [b, 1, t, f] + inputs = inputs.squeeze(1) + # [b, t, f] + + # [b, t, f] + out, (h, c) = self.lstm(inputs, (h, 
c))
+
+        if self.pool_layer == "mean":
+            features = torch.mean(out, dim=1)
+            # features shape: [batch_size, hidden_size]
+        elif self.pool_layer == "last":
+            features = out[:, -1, :]
+            # features shape: [batch_size, hidden_size]
+        else:
+            raise ValueError("pool_layer must be mean or last")
+        return features, h, c
+
+    def _init_hidden(self, batch_size: int, device: torch.device):
+        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)
+        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)
+        return h0, c0
+
+
+class WaveEncoder(nn.Module):
+    def __init__(self,
+                 mel_spectrogram_param: dict,
+                 lstm_layer_param: dict,
+                 ):
+        super().__init__()
+        self.mel_spectrogram_param = mel_spectrogram_param
+        self.lstm_layer_param = lstm_layer_param
+
+        self.wave_to_mel_spectrogram = torch.nn.Sequential(
+            torchaudio.transforms.MelSpectrogram(
+                sample_rate=mel_spectrogram_param["sample_rate"],
+                n_fft=mel_spectrogram_param["n_fft"],
+                win_length=mel_spectrogram_param["win_length"],
+                hop_length=mel_spectrogram_param["hop_length"],
+                f_min=mel_spectrogram_param["f_min"],
+                f_max=mel_spectrogram_param["f_max"],
+                window_fn=torch.hamming_window if mel_spectrogram_param["window_fn"] == "hamming" else torch.hann_window,
+                n_mels=mel_spectrogram_param["n_mels"],
+            ),
+        )
+
+        self.lstm_layer = LSTMLayer(
+            input_size=lstm_layer_param["input_size"],
+            hidden_size=lstm_layer_param["hidden_size"],
+            num_layers=lstm_layer_param["num_layers"],
+            dropout=lstm_layer_param["dropout"],
+            pool_layer=lstm_layer_param["pool_layer"],
+        )
+
+    def forward(self, inputs: torch.Tensor, h: Optional[torch.Tensor] = None, c: Optional[torch.Tensor] = None):
+        # inputs: [batch_size, num_samples], a raw waveform
+        x = inputs
+
+        with torch.no_grad():
+            # shape: [batch_size, n_mels, seq_length]
+            x = self.wave_to_mel_spectrogram(x) + 1e-6
+            x = x.log()
+            x = x - torch.mean(x, dim=-1, keepdim=True)
+
+        x = x.transpose(1, 2)
+        # x: [batch_size, seq_length, n_mels]
+
+        features, h, c = self.lstm_layer.forward(x, h, c)
+        # features: [batch_size, hidden_size]
+        return features, h, c
+
+
+class ClsHead(nn.Module):
+    def __init__(self,
+                 input_dim: int,
+                 num_layers: int,
+                 hidden_dims: Union[int, List[int]],
+                 activations: Union[str, List[str]],
+                 num_labels: int,
+                 dropout: Union[float, List[float]] = 0.0
+                 ):
+        super(ClsHead, self).__init__()
+
+        self.feedforward = FeedForward(
+            input_dim=input_dim,
+            num_layers=num_layers,
+            hidden_dims=hidden_dims,
+            activations=activations,
+            dropout=dropout,
+        )
+
+        self.output_project_layer = nn.Linear(self.feedforward.get_output_dim(), num_labels)
+
+    def forward(self, inputs: torch.Tensor):
+        # inputs: [batch_size, hidden_size], already pooled by the LSTM layer
+        x = self.feedforward(inputs)
+
+        # logits: [batch_size, num_labels]
+        logits = self.output_project_layer.forward(x)
+        return logits
+
+
+class LSTMClassifier(nn.Module):
+    def __init__(self,
+                 wave_encoder: WaveEncoder,
+                 cls_head: ClsHead
+                 ):
+        super(LSTMClassifier, self).__init__()
+        self.wave_encoder = wave_encoder
+        self.cls_head = cls_head
+
+    def forward(self,
+                inputs: torch.Tensor,
+                h: Optional[torch.Tensor] = None,
+                c: Optional[torch.Tensor] = None,
+                ):
+        features, h, c = self.wave_encoder(inputs, h, c)
+        logits = self.cls_head(features)
+        # logits shape: [batch_size, num_classes]
+        return logits, h, c
+
+
+class LSTMClassifierPretrainedModel(LSTMClassifier):
+    def __init__(self,
+                 config: LSTMClassifierConfig,
+                 ):
+        super(LSTMClassifierPretrainedModel, self).__init__(
+            wave_encoder=WaveEncoder(
+                mel_spectrogram_param=config.mel_spectrogram_param,
lstm_layer_param=config.lstm_layer_param, + + ), + cls_head=ClsHead( + input_dim=config.cls_head_param["input_dim"], + num_layers=config.cls_head_param["num_layers"], + hidden_dims=config.cls_head_param["hidden_dims"], + activations=config.cls_head_param["activations"], + num_labels=config.cls_head_param["num_labels"], + dropout=config.cls_head_param["dropout"], + ) + ) + self.config = config + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs): + config = LSTMClassifierConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + + model = cls(config) + + if os.path.isdir(pretrained_model_name_or_path): + ckpt_file = os.path.join(pretrained_model_name_or_path, MODEL_FILE) + else: + ckpt_file = pretrained_model_name_or_path + + with open(ckpt_file, "rb") as f: + state_dict = torch.load(f, map_location="cpu") + model.load_state_dict(state_dict, strict=True) + return model + + def save_pretrained(self, + save_directory: Union[str, os.PathLike], + state_dict: Optional[dict] = None, + ): + + model = self + + if state_dict is None: + state_dict = model.state_dict() + + os.makedirs(save_directory, exist_ok=True) + + # save state dict + model_file = os.path.join(save_directory, MODEL_FILE) + torch.save(state_dict, model_file) + + # save config + config_file = os.path.join(save_directory, CONFIG_FILE) + self.config.to_yaml_file(config_file) + return save_directory + + +def main(): + config = LSTMClassifierConfig.from_pretrained("examples/lstm_classifier.yaml") + model = LSTMClassifierPretrainedModel(config) + print(model) + return + + +if __name__ == "__main__": + main()