{"id":"https://openalex.org/W4401252969","doi":"https://doi.org/10.1007/s10462-024-10869-1","title":"Cascaded cross-modal transformer for audio\u2013textual classification","display_name":"Cascaded cross-modal transformer for audio\u2013textual classification","publication_year":2024,"publication_date":"2024-08-02","ids":{"openalex":"https://openalex.org/W4401252969","doi":"https://doi.org/10.1007/s10462-024-10869-1"},"language":"en","primary_location":{"id":"doi:10.1007/s10462-024-10869-1","is_oa":true,"landing_page_url":"http://dx.doi.org/10.1007/s10462-024-10869-1","pdf_url":"https://link.springer.com/content/pdf/10.1007/s10462-024-10869-1.pdf","source":{"id":"https://openalex.org/S122814990","display_name":"Artificial Intelligence Review","issn_l":"0269-2821","issn":["0269-2821","1573-7462"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Artificial Intelligence Review","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://link.springer.com/content/pdf/10.1007/s10462-024-10869-1.pdf","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5083932459","display_name":"Nicolae-C\u0103t\u0103lin Ristea","orcid":"https://orcid.org/0000-0002-7880-9307"},"institutions":[{"id":"https://openalex.org/I61641377","display_name":"Universitatea Na\u021bional\u0103 de \u0218tiin\u021b\u0103 \u0219i Tehnologie Politehnica Bucure\u0219ti","ror":"https://ror.org/0558j5q12","country_code":"RO","type":"education","lineage":["https://openalex.org/I61641377"]}],"countries":["RO"],"is_corresponding":false,"raw_author_name":"Nicolae-C\u0103t\u0103lin Ristea","raw_affiliation_strings":["National University of Science and Technology Politehnica Bucharest, Bucharest, Romania"],"affiliations":[{"raw_affiliation_string":"National University of Science and Technology Politehnica Bucharest, Bucharest, Romania","institution_ids":["https://openalex.org/I61641377"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038937968","display_name":"Andrei Anghel","orcid":"https://orcid.org/0000-0003-3875-3238"},"institutions":[{"id":"https://openalex.org/I61641377","display_name":"Universitatea Na\u021bional\u0103 de \u0218tiin\u021b\u0103 \u0219i Tehnologie Politehnica Bucure\u0219ti","ror":"https://ror.org/0558j5q12","country_code":"RO","type":"education","lineage":["https://openalex.org/I61641377"]}],"countries":["RO"],"is_corresponding":false,"raw_author_name":"Andrei Anghel","raw_affiliation_strings":["National University of Science and Technology Politehnica Bucharest, Bucharest, Romania"],"affiliations":[{"raw_affiliation_string":"National University of Science and Technology Politehnica Bucharest, Bucharest, Romania","institution_ids":["https://openalex.org/I61641377"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5081017623","display_name":"Radu Tudor Ionescu","orcid":"https://orcid.org/0000-0002-9301-1950"},"institutions":[{"id":"https://openalex.org/I141595442","display_name":"University of Bucharest","ror":"https://ror.org/02x2v6p15","country_code":"RO","type":"education","lineage":["https://openalex.org/I141595442"]}],"countries":["RO"],"is_corresponding":true,"raw_author_name":"Radu Tudor Ionescu","raw_affiliation_strings":["University of Bucharest, Bucharest, Romania"],"affiliations":[{"raw_affiliation_string":"University of Bucharest, Bucharest, Romania","institution_ids":["https://openalex.org/I141595442"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5081017623"],"corresponding_institution_ids":["https://openalex.org/I141595442"],"apc_list":{"value":2490,"currency":"EUR","value_usd":3090},"apc_paid":{"value":2490,"currency":"EUR","value_usd":3090},"fwci":0.7501,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.68711854,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":"57","issue":"9","first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8714554309844971},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.7111682891845703},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.6551244258880615},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5399029850959778},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.4873705506324768},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.48602133989334106},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.45905598998069763},{"id":"https://openalex.org/keywords/grasp","display_name":"GRASP","score":0.443144291639328},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4247584640979767},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.09938037395477295}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8714554309844971},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.7111682891845703},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.6551244258880615},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5399029850959778},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.4873705506324768},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.48602133989334106},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.45905598998069763},{"id":"https://openalex.org/C171268870","wikidata":"https://www.wikidata.org/wiki/Q1486676","display_name":"GRASP","level":2,"score":0.443144291639328},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4247584640979767},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.09938037395477295},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1007/s10462-024-10869-1","is_oa":true,"landing_page_url":"http://dx.doi.org/10.1007/s10462-024-10869-1","pdf_url":"https://link.springer.com/content/pdf/10.1007/s10462-024-10869-1.pdf","source":{"id":"https://openalex.org/S122814990","display_name":"Artificial Intelligence Review","issn_l":"0269-2821","issn":["0269-2821","1573-7462"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Artificial Intelligence Review","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1007/s10462-024-10869-1","is_oa":true,"landing_page_url":"http://dx.doi.org/10.1007/s10462-024-10869-1","pdf_url":"https://link.springer.com/content/pdf/10.1007/s10462-024-10869-1.pdf","source":{"id":"https://openalex.org/S122814990","display_name":"Artificial Intelligence Review","issn_l":"0269-2821","issn":["0269-2821","1573-7462"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Artificial Intelligence Review","raw_type":"journal-article"},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.7300000190734863}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4401252969.pdf"},"referenced_works_count":53,"referenced_works":["https://openalex.org/W2064675550","https://openalex.org/W2137424107","https://openalex.org/W2194775991","https://openalex.org/W2593116425","https://openalex.org/W2767290858","https://openalex.org/W2931364255","https://openalex.org/W2962770129","https://openalex.org/W2986154550","https://openalex.org/W3011727199","https://openalex.org/W3016138882","https://openalex.org/W3037572520","https://openalex.org/W3083173864","https://openalex.org/W3094550259","https://openalex.org/W3094595351","https://openalex.org/W3097018422","https://openalex.org/W3097158535","https://openalex.org/W3102046836","https://openalex.org/W3103187652","https://openalex.org/W3125775899","https://openalex.org/W3156333129","https://openalex.org/W3171536535","https://openalex.org/W3185593275","https://openalex.org/W3186192207","https://openalex.org/W3196974791","https://openalex.org/W3204266507","https://openalex.org/W3206996142","https://openalex.org/W3211189476","https://openalex.org/W4206042814","https://openalex.org/W4206994745","https://openalex.org/W4212926655","https://openalex.org/W4220759704","https://openalex.org/W4221167764","https://openalex.org/W4225319488","https://openalex.org/W4226315417","https://openalex.org/W4285114562","https://openalex.org/W4296068596","https://openalex.org/W4307640249","https://openalex.org/W4312384316","https://openalex.org/W4321231176","https://openalex.org/W4321368636","https://openalex.org/W4321497840","https://openalex.org/W4322736091","https://openalex.org/W4376226279","https://openalex.org/W4386047824","https://openalex.org/W4387724935","https://openalex.org/W4387968017","https://openalex.org/W4387968410","https://openalex.org/W4387968621","https://openalex.org/W4390874283","https://openalex.org/W6600424091","https://openalex.org/W6604963970","https://openalex.org/W6606191167","https://openalex.org/W6818723395"],"related_works":["https://openalex.org/W4288365749","https://openalex.org/W2936497627","https://openalex.org/W3013624417","https://openalex.org/W4287826556","https://openalex.org/W3098382480","https://openalex.org/W4287598411","https://openalex.org/W3094871513","https://openalex.org/W3100913109","https://openalex.org/W3198458223","https://openalex.org/W3126642501"],"abstract_inverted_index":{"Abstract":[0],"Speech":[1,167],"classification":[2,26],"tasks":[3],"often":[4],"require":[5],"powerful":[6],"language":[7],"understanding":[8],"models":[9,45],"to":[10,30],"grasp":[11],"useful":[12],"features,":[13],"which":[14],"becomes":[15],"problematic":[16],"when":[17],"limited":[18],"training":[19],"data":[20,66,173],"is":[21,90,185],"available.":[22],"To":[23],"attain":[24],"superior":[25],"performance,":[27],"we":[28,69,161],"propose":[29],"harness":[31],"the":[32,48,107,119,128,132,142,166],"inherent":[33],"value":[34],"of":[35,131,150],"multimodal":[36],"representations":[37],"by":[38,118],"transcribing":[39],"speech":[40,43],"using":[41],"automatic":[42],"recognition":[44],"and":[46,152,156,170],"translating":[47],"transcripts":[49],"into":[50],"different":[51],"languages":[52],"via":[53,81],"pretrained":[54],"translation":[55],"models.":[56],"We":[57,123],"thus":[58],"obtain":[59],"an":[60,146],"audio\u2013textual":[61],"(multimodal)":[62],"representation":[63],"for":[64,154,188],"each":[65],"sample.":[67],"Subsequently,":[68],"combine":[70],"language-specific":[71],"Bidirectional":[72],"Encoder":[73],"Representations":[74],"from":[75,103],"Transformers":[76],"with":[77,113],"Wav2Vec2.0":[78],"audio":[79],"features":[80,102,112,115],"a":[82],"novel":[83],"cascaded":[84,94],"cross-modal":[85],"transformer":[86,95,121],"(CCMT).":[87],"Our":[88,183],"model":[89],"based":[91],"on":[92,165,180],"two":[93],"blocks.":[96],"The":[97],"first":[98,120],"one":[99,109],"combines":[100,110],"text-specific":[101],"distinct":[104],"languages,":[105],"while":[106],"second":[108],"acoustic":[111],"multilingual":[114],"previously":[116],"learned":[117],"block.":[122],"employed":[124],"our":[125,163],"system":[126],"in":[127],"Requests":[129],"Sub-Challenge":[130],"ACM":[133],"Multimedia":[134],"2023":[135],"Computational":[136],"Paralinguistics":[137],"Challenge.":[138],"CCMT":[139],"was":[140],"declared":[141],"winning":[143],"solution,":[144],"obtaining":[145],"unweighted":[147],"average":[148],"recall":[149],"65.41%":[151],"85.87%":[153],"complaint":[155],"request":[157],"detection,":[158],"respectively.":[159],"Moreover,":[160],"applied":[162],"framework":[164],"Commands":[168],"v2":[169],"HVB":[171],"dialog":[172],"sets,":[174],"surpassing":[175],"previous":[176],"studies":[177],"reporting":[178],"results":[179],"these":[181],"benchmarks.":[182],"code":[184],"freely":[186],"available":[187],"download":[189],"at:":[190],"https://github.com/ristea/ccmt":[191],".":[192]},"counts_by_year":[{"year":2026,"cited_by_count":2}],"updated_date":"2025-12-19T19:40:27.379048","created_date":"2025-10-10T00:00:00"}
