{"id":"https://openalex.org/W4383340360","doi":"https://doi.org/10.1109/taslp.2023.3293015","title":"ACTUAL: Audio Captioning With Caption Feature Space Regularization","display_name":"ACTUAL: Audio Captioning With Caption Feature Space Regularization","publication_year":2023,"publication_date":"2023-01-01","ids":{"openalex":"https://openalex.org/W4383340360","doi":"https://doi.org/10.1109/taslp.2023.3293015"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2023.3293015","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2023.3293015","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5103038399","display_name":"Yiming Zhang","orcid":"https://orcid.org/0000-0003-1172-6846"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yiming Zhang","raw_affiliation_strings":["Pattern Recognition and Intelligent System Laboratory, School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Pattern Recognition and Intelligent System Laboratory, School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102842246","display_name":"Hong Yu","orcid":"https://orcid.org/0009-0000-5992-2892"},"institutions":[{"id":"https://openalex.org/I182707071","display_name":"Ludong University","ror":"https://ror.org/028h95t32","country_code":"CN","type":"education","lineage":["https://openalex.org/I182707071"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hong Yu","raw_affiliation_strings":["Department of Artificial Intelligence, School of Information and Electrical Engineering, Ludong University, Yantai, Shandong, China"],"affiliations":[{"raw_affiliation_string":"Department of Artificial Intelligence, School of Information and Electrical Engineering, Ludong University, Yantai, Shandong, China","institution_ids":["https://openalex.org/I182707071"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011876517","display_name":"Ruoyi Du","orcid":"https://orcid.org/0000-0001-8372-5637"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ruoyi Du","raw_affiliation_strings":["Pattern Recognition and Intelligent System Laboratory, School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Pattern Recognition and Intelligent System Laboratory, School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090108098","display_name":"Zheng\u2010Hua Tan","orcid":"https://orcid.org/0000-0001-6856-8928"},"institutions":[{"id":"https://openalex.org/I891191580","display_name":"Aalborg University","ror":"https://ror.org/04m5j1k67","country_code":"DK","type":"education","lineage":["https://openalex.org/I891191580"]}],"countries":["DK"],"is_corresponding":false,"raw_author_name":"Zheng-Hua Tan","raw_affiliation_strings":["Department of Electronic Systems, Aalborg University, Aalborg, Denmark"],"affiliations":[{"raw_affiliation_string":"Department of Electronic Systems, Aalborg University, Aalborg, Denmark","institution_ids":["https://openalex.org/I891191580"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100676721","display_name":"Wenwu Wang","orcid":"https://orcid.org/0000-0002-8393-5703"},"institutions":[{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Wenwu Wang","raw_affiliation_strings":["Centre for Vision, Speech and Signal Processing, University of Surrey, Guildford, U.K","Centre for Vision, Speech and Signal Processing, University of Surrey, Guildford, United Kingdom"],"affiliations":[{"raw_affiliation_string":"Centre for Vision, Speech and Signal Processing, University of Surrey, Guildford, U.K","institution_ids":["https://openalex.org/I28290843"]},{"raw_affiliation_string":"Centre for Vision, Speech and Signal Processing, University of Surrey, Guildford, United Kingdom","institution_ids":["https://openalex.org/I28290843"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039812471","display_name":"Zhanyu Ma","orcid":"https://orcid.org/0000-0003-2950-2488"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhanyu Ma","raw_affiliation_strings":["Pattern Recognition and Intelligent System Laboratory, School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Pattern Recognition and Intelligent System Laboratory, School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5056136429","display_name":"Yuan Dong","orcid":"https://orcid.org/0009-0004-8650-1603"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuan Dong","raw_affiliation_strings":["Pattern Recognition and Intelligent System Laboratory, School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Pattern Recognition and Intelligent System Laboratory, School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5103038399"],"corresponding_institution_ids":["https://openalex.org/I139759216"],"apc_list":null,"apc_paid":null,"fwci":2.2196,"has_fulltext":false,"cited_by_count":11,"citation_normalized_percentile":{"value":0.88713374,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":"31","issue":null,"first_page":"2643","last_page":"2657"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.996999979019165,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9965000152587891,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.9473357796669006},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6533403396606445},{"id":"https://openalex.org/keywords/regularization","display_name":"Regularization (linguistics)","score":0.6162614822387695},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5771772265434265},{"id":"https://openalex.org/keywords/space","display_name":"Space (punctuation)","score":0.4946112334728241},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.44828781485557556},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3455321788787842},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.1126248836517334},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.09699937701225281}],"concepts":[{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.9473357796669006},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6533403396606445},{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.6162614822387695},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5771772265434265},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.4946112334728241},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.44828781485557556},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3455321788787842},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.1126248836517334},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.09699937701225281},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/taslp.2023.3293015","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2023.3293015","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},{"id":"pmh:oai:pure.atira.dk:publications/a6911138-34f8-4fb5-ba87-12aa3549aabb","is_oa":false,"landing_page_url":"https://vbn.aau.dk/da/publications/a6911138-34f8-4fb5-ba87-12aa3549aabb","pdf_url":null,"source":{"id":"https://openalex.org/S4306401731","display_name":"VBN Forskningsportal (Aalborg Universitet)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I891191580","host_organization_name":"Aalborg University","host_organization_lineage":["https://openalex.org/I891191580"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Zhang, Y, Yu, H, Du, R, Tan, Z H, Wang, W, Ma, Z & Dong, Y 2023, 'ACTUAL : Audio Captioning With Caption Feature Space Regularization', IEEE/ACM Transactions on Audio Speech and Language Processing, vol. 31, pp. 2643-2657. https://doi.org/10.1109/TASLP.2023.3293015","raw_type":"info:eu-repo/semantics/publishedVersion"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G172479822","display_name":null,"funder_award_id":"62225601","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3848188706","display_name":null,"funder_award_id":"U19B2036","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7697810044","display_name":null,"funder_award_id":"Z200002","funder_id":"https://openalex.org/F4320322919","funder_display_name":"Natural Science Foundation of Beijing Municipality"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320322919","display_name":"Natural Science Foundation of Beijing Municipality","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":51,"referenced_works":["https://openalex.org/W648786980","https://openalex.org/W1614298861","https://openalex.org/W2101105183","https://openalex.org/W2123301721","https://openalex.org/W2133564696","https://openalex.org/W2154652894","https://openalex.org/W2183341477","https://openalex.org/W2250539671","https://openalex.org/W2506483933","https://openalex.org/W2584424117","https://openalex.org/W2916103538","https://openalex.org/W2936774411","https://openalex.org/W2949376505","https://openalex.org/W2962788625","https://openalex.org/W2963341956","https://openalex.org/W2964199361","https://openalex.org/W2964213897","https://openalex.org/W2982554818","https://openalex.org/W3015591594","https://openalex.org/W3097791920","https://openalex.org/W3099807503","https://openalex.org/W3103022576","https://openalex.org/W3122335742","https://openalex.org/W3135656708","https://openalex.org/W3160577380","https://openalex.org/W3166396011","https://openalex.org/W3186781156","https://openalex.org/W3187963534","https://openalex.org/W3205708381","https://openalex.org/W3205860970","https://openalex.org/W3207373632","https://openalex.org/W4221153784","https://openalex.org/W4224926581","https://openalex.org/W4280567182","https://openalex.org/W4297841258","https://openalex.org/W4309795604","https://openalex.org/W4372340819","https://openalex.org/W4385245566","https://openalex.org/W4385822505","https://openalex.org/W6621543089","https://openalex.org/W6636510571","https://openalex.org/W6678262379","https://openalex.org/W6679434410","https://openalex.org/W6682631176","https://openalex.org/W6733093781","https://openalex.org/W6739901393","https://openalex.org/W6791353385","https://openalex.org/W6799303324","https://openalex.org/W6802998326","https://openalex.org/W6846221438","https://openalex.org/W6847068019"],"related_works":["https://openalex.org/W4210416330","https://openalex.org/W2775506363","https://openalex.org/W3088136942","https://openalex.org/W2963177403","https://openalex.org/W4290852288","https://openalex.org/W2949362007","https://openalex.org/W4283207562","https://openalex.org/W2330246314","https://openalex.org/W2949522393","https://openalex.org/W4289422896"],"abstract_inverted_index":{"Audio":[0],"captioning":[1,64,118],"aims":[2],"at":[3,81,155,237],"describing":[4],"the":[5,15,24,38,52,54,62,74,96,102,105,137,151,156,164,167,178,181,194,197,211,214,218],"content":[6],"of":[7,17,104,180,196,213],"audio":[8,18,26,40,63,79,117,157],"clips":[9],"with":[10,48,124],"human":[11],"language.":[12],"Due":[13],"to":[14,60,92,144,176,192,229],"ambiguity":[16],"content,":[19],"different":[20],"people":[21],"may":[22,42],"perceive":[23],"same":[25,39],"clip":[27,41,80],"differently,":[28],"resulting":[29],"in":[30,136,163,183],"caption":[31,69,206],"disparities":[32],"(":[33],"<italic":[34],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[35,239],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">i.e.</i>":[36],",":[37],"be":[43],"described":[44],"by":[45,114],"several":[46],"captions":[47,154],"diverse":[49],"semantics).":[50],"In":[51,107],"literature,":[53],"one-to-many":[55],"strategy":[56],"is":[57,70,171,234],"often":[58],"employed":[59],"train":[61],"models,":[65],"where":[66,150],"a":[67,131,146,184],"related":[68],"randomly":[71],"selected":[72],"as":[73,173],"optimization":[75,97,179],"target":[76],"for":[77],"each":[78,82],"training":[83,133],"iteration.":[84],"However,":[85],"we":[86,110,140],"observe":[87],"that":[88,204],"this":[89,108,112],"can":[90,208],"lead":[91],"significant":[93],"variations":[94],"during":[95],"process":[98],"and":[99,161,217],"adversely":[100],"affect":[101],"performance":[103,212,224],"model.":[106],"paper,":[109],"address":[111],"issue":[113],"proposing":[115],"an":[116],"method,":[119],"named":[120],"ACTUAL":[121,129,199,220],"(Audio":[122],"Captioning":[123],"capTion":[125],"featUre":[126],"spAce":[127],"reguLarization).":[128],"involves":[130],"two-stage":[132],"process:":[134],"(i)":[135],"first":[138],"stage,":[139,166],"use":[141],"contrastive":[142],"learning":[143],"construct":[145],"proxy":[147,168,205],"feature":[148,169],"space":[149,170],"similarities":[152],"between":[153],"level":[158],"are":[159],"explored,":[160],"(ii)":[162],"second":[165],"utilized":[172],"additional":[174],"supervision":[175],"improve":[177,210],"model":[182,216],"more":[185],"stable":[186],"direction.":[187],"We":[188],"conduct":[189],"extensive":[190],"experiments":[191],"demonstrate":[193],"effectiveness":[195],"proposed":[198,219],"method.":[200],"The":[201,232],"results":[202],"show":[203],"embedding":[207],"significantly":[209],"baseline":[215],"method":[221],"offers":[222],"competitive":[223],"on":[225],"two":[226],"datasets":[227],"compared":[228],"state-of-the-art":[230],"methods.":[231],"code":[233],"publicly":[235],"available":[236],"<uri":[238],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">https://github.com/PRIS-CV/Caption-Feature-Space-Regularization</uri>":[240],".":[241]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":5},{"year":2023,"cited_by_count":3}],"updated_date":"2026-03-12T08:34:05.389933","created_date":"2025-10-10T00:00:00"}
