{"id":"https://openalex.org/W4410770984","doi":"https://doi.org/10.1109/icasspw65056.2025.11011270","title":"MACE: Leveraging Audio for Evaluating Audio Captioning Systems","display_name":"MACE: Leveraging Audio for Evaluating Audio Captioning Systems","publication_year":2025,"publication_date":"2025-04-06","ids":{"openalex":"https://openalex.org/W4410770984","doi":"https://doi.org/10.1109/icasspw65056.2025.11011270"},"language":"en","primary_location":{"id":"doi:10.1109/icasspw65056.2025.11011270","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icasspw65056.2025.11011270","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5044562674","display_name":"Sudhir Dixit","orcid":"https://orcid.org/0000-0001-9124-7110"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Satvik Dixit","raw_affiliation_strings":["Carnegie Mellon University"],"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017946811","display_name":"Soham Deshmukh","orcid":null},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Soham Deshmukh","raw_affiliation_strings":["Carnegie Mellon University"],"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5113017615","display_name":"Bhiksha Raj","orcid":null},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Bhiksha Raj","raw_affiliation_strings":["Carnegie Mellon University"],"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University","institution_ids":["https://openalex.org/I74973139"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5044562674"],"corresponding_institution_ids":["https://openalex.org/I74973139"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.15672269,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9897000193595886,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9897000193595886,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13310","display_name":"Subtitles and Audiovisual Media","score":0.987500011920929,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9537000060081482,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7357289791107178},{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.7120959162712097},{"id":"https://openalex.org/keywords/digital-audio-broadcasting","display_name":"Digital audio broadcasting","score":0.6191225647926331},{"id":"https://openalex.org/keywords/mace","display_name":"Mace","score":0.607673168182373},{"id":"https://openalex.org/keywords/audio-analyzer","display_name":"Audio analyzer","score":0.452620267868042},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.40233340859413147},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.37181398272514343},{"id":"https://openalex.org/keywords/digital-audio","display_name":"Digital audio","score":0.3669213056564331},{"id":"https://openalex.org/keywords/audio-signal","display_name":"Audio signal","score":0.32301899790763855},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.24136468768119812},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.21253320574760437},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.15381574630737305}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7357289791107178},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.7120959162712097},{"id":"https://openalex.org/C2779106878","wikidata":"https://www.wikidata.org/wiki/Q1257510","display_name":"Digital audio broadcasting","level":2,"score":0.6191225647926331},{"id":"https://openalex.org/C2780739214","wikidata":"https://www.wikidata.org/wiki/Q6723023","display_name":"Mace","level":4,"score":0.607673168182373},{"id":"https://openalex.org/C160372630","wikidata":"https://www.wikidata.org/wiki/Q4819855","display_name":"Audio analyzer","level":5,"score":0.452620267868042},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.40233340859413147},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.37181398272514343},{"id":"https://openalex.org/C87687168","wikidata":"https://www.wikidata.org/wiki/Q173114","display_name":"Digital audio","level":4,"score":0.3669213056564331},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.32301899790763855},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.24136468768119812},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.21253320574760437},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.15381574630737305},{"id":"https://openalex.org/C118552586","wikidata":"https://www.wikidata.org/wiki/Q7867","display_name":"Psychiatry","level":1,"score":0.0},{"id":"https://openalex.org/C45393284","wikidata":"https://www.wikidata.org/wiki/Q191012","display_name":"Conventional PCI","level":3,"score":0.0},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0},{"id":"https://openalex.org/C500558357","wikidata":"https://www.wikidata.org/wiki/Q12152","display_name":"Myocardial infarction","level":2,"score":0.0},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icasspw65056.2025.11011270","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icasspw65056.2025.11011270","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":34,"referenced_works":["https://openalex.org/W1956340063","https://openalex.org/W2101105183","https://openalex.org/W2103235956","https://openalex.org/W2506483933","https://openalex.org/W2945761034","https://openalex.org/W2949376505","https://openalex.org/W2970641574","https://openalex.org/W3015591594","https://openalex.org/W3094550259","https://openalex.org/W3205860970","https://openalex.org/W4226442948","https://openalex.org/W4280567182","https://openalex.org/W4315701203","https://openalex.org/W4372260310","https://openalex.org/W4372266552","https://openalex.org/W4388118619","https://openalex.org/W4392903479","https://openalex.org/W4392903801","https://openalex.org/W4392909554","https://openalex.org/W4400033239","https://openalex.org/W4402112400","https://openalex.org/W4402670888","https://openalex.org/W4403780823","https://openalex.org/W6678262379","https://openalex.org/W6682631176","https://openalex.org/W6750883802","https://openalex.org/W6761205521","https://openalex.org/W6776094423","https://openalex.org/W6778883912","https://openalex.org/W6846793904","https://openalex.org/W6852818750","https://openalex.org/W6860041859","https://openalex.org/W6873290278","https://openalex.org/W6873387908"],"related_works":["https://openalex.org/W2130109774","https://openalex.org/W2355917813","https://openalex.org/W1975359510","https://openalex.org/W2797066576","https://openalex.org/W3110605476","https://openalex.org/W2363106653","https://openalex.org/W1496727373","https://openalex.org/W2494591065","https://openalex.org/W2020952589","https://openalex.org/W2319340652"],"abstract_inverted_index":{"The":[0,170],"Automated":[1],"Audio":[2],"Captioning":[3],"(AAC)":[4],"task":[5],"aims":[6],"to":[7,68,132],"describe":[8],"an":[9],"audio":[10,25,34,73,91,97,102,105,166],"signal":[11,30,74],"using":[12],"natural":[13,41],"language.":[14],"To":[15],"evaluate":[16],"machine-generated":[17],"captions,":[18],"the":[19,72,146,150,161,165],"metrics":[20,44,51,63,163],"should":[21],"take":[22],"into":[23],"account":[24],"events,":[26],"acoustic":[27],"scenes,":[28],"paralinguistics,":[29],"characteristics,":[31],"and":[32,47,55,92,110,113,140,152],"other":[33],"information.":[35],"Traditional":[36],"AAC":[37],"evaluation":[38,168],"relies":[39],"on":[40,149,164],"language":[42],"generation":[43],"like":[45],"ROUGE":[46],"BLEU,":[48],"image":[49],"captioning":[50,167],"such":[52],"as":[53,106,108],"SPICE":[54],"CIDEr,":[56],"or":[57],"Sentence-BERT":[58],"embedding":[59],"similarity.":[60],"However,":[61],"these":[62],"only":[64],"compare":[65],"generated":[66],"captions":[67,94,112],"human":[69,128],"references,":[70],"overlooking":[71],"itself.":[75],"In":[76],"this":[77],"work,":[78],"we":[79],"propose":[80],"MACE":[81,100,136],"(Multimodal":[82],"Audio-Caption":[83],"Evaluation),":[84],"a":[85,117,138],"novel":[86],"metric":[87,148,171],"that":[88],"integrates":[89],"both":[90],"reference":[93,111],"for":[95],"comprehensive":[96],"caption":[98],"evaluation.":[99],"incorporates":[101],"information":[103],"from":[104],"well":[107],"predicted":[109],"weights":[114],"it":[115,157],"with":[116],"fluency":[118],"penalty.":[119],"Our":[120],"experiments":[121],"demonstrate":[122],"MACE\u2019s":[123],"superior":[124],"performance":[125],"in":[126],"predicting":[127],"quality":[129],"judgments":[130],"compared":[131],"traditional":[133],"metrics.":[134],"Specifically,":[135],"achieves":[137],"3.28%":[139],"4.36%":[141],"relative":[142],"accuracy":[143],"improvement":[144],"over":[145],"FENSE":[147],"AudioCaps-Eval":[151],"Clotho-Eval":[153],"datasets":[154],"respectively.":[155],"Moreover,":[156],"significantly":[158],"outperforms":[159],"all":[160],"previous":[162],"task.":[169],"is":[172],"opensourced":[173],"at":[174],"https://github.com/satvik-dixit/mace.":[175]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
