{"id":"https://openalex.org/W7123354503","doi":"https://doi.org/10.1109/tmm.2026.3651023","title":"Soundscape Captioning Using Sound Affective Quality Network and Large Language Model","display_name":"Soundscape Captioning Using Sound Affective Quality Network and Large Language Model","publication_year":2026,"publication_date":"2026-01-01","ids":{"openalex":"https://openalex.org/W7123354503","doi":"https://doi.org/10.1109/tmm.2026.3651023"},"language":null,"primary_location":{"id":"doi:10.1109/tmm.2026.3651023","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2026.3651023","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5063248625","display_name":"Yuanbo Hou","orcid":"https://orcid.org/0000-0001-8469-5740"},"institutions":[{"id":"https://openalex.org/I32597200","display_name":"Ghent University","ror":"https://ror.org/00cv9y106","country_code":"BE","type":"education","lineage":["https://openalex.org/I32597200"]}],"countries":["BE"],"is_corresponding":true,"raw_author_name":"Yuanbo Hou","raw_affiliation_strings":["Department of Information Technology, WAVES Research Group, Ghent University, Gent, Belgium"],"affiliations":[{"raw_affiliation_string":"Department of Information Technology, WAVES Research Group, Ghent University, Gent, Belgium","institution_ids":["https://openalex.org/I32597200"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110681968","display_name":"Qiaoqiao Ren","orcid":null},"institutions":[{"id":"https://openalex.org/I32597200","display_name":"Ghent University","ror":"https://ror.org/00cv9y106","country_code":"BE","type":"education","lineage":["https://openalex.org/I32597200"]}],"countries":["BE"],"is_corresponding":false,"raw_author_name":"Qiaoqiao Ren","raw_affiliation_strings":["AIRO-IDLab, Ghent University-Imec, Gent, Belgium"],"affiliations":[{"raw_affiliation_string":"AIRO-IDLab, Ghent University-Imec, Gent, Belgium","institution_ids":["https://openalex.org/I32597200"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071923410","display_name":"Andrew D. Mitchell","orcid":"https://orcid.org/0000-0001-8399-8563"},"institutions":[{"id":"https://openalex.org/I45129253","display_name":"University College London","ror":"https://ror.org/02jx3x895","country_code":"GB","type":"education","lineage":["https://openalex.org/I124357947","https://openalex.org/I45129253"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Andrew Mitchell","raw_affiliation_strings":["University College London, London, U.K"],"affiliations":[{"raw_affiliation_string":"University College London, London, U.K","institution_ids":["https://openalex.org/I45129253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108818675","display_name":"W. Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Wenwu Wang","raw_affiliation_strings":["CVSSP, University of Surrey, Guildford, U.K"],"affiliations":[{"raw_affiliation_string":"CVSSP, University of Surrey, Guildford, U.K","institution_ids":["https://openalex.org/I28290843"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122903624","display_name":"Jian Kang","orcid":null},"institutions":[{"id":"https://openalex.org/I45129253","display_name":"University College London","ror":"https://ror.org/02jx3x895","country_code":"GB","type":"education","lineage":["https://openalex.org/I124357947","https://openalex.org/I45129253"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Jian Kang","raw_affiliation_strings":["University College London, London, U.K"],"affiliations":[{"raw_affiliation_string":"University College London, London, U.K","institution_ids":["https://openalex.org/I45129253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035627933","display_name":"Tony Belpaeme","orcid":"https://orcid.org/0000-0001-5207-7745"},"institutions":[{"id":"https://openalex.org/I32597200","display_name":"Ghent University","ror":"https://ror.org/00cv9y106","country_code":"BE","type":"education","lineage":["https://openalex.org/I32597200"]}],"countries":["BE"],"is_corresponding":false,"raw_author_name":"Tony Belpaeme","raw_affiliation_strings":["AIRO-IDLab, Ghent University-Imec, Gent, Belgium"],"affiliations":[{"raw_affiliation_string":"AIRO-IDLab, Ghent University-Imec, Gent, Belgium","institution_ids":["https://openalex.org/I32597200"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5121276211","display_name":"Dick Botteldooren","orcid":null},"institutions":[{"id":"https://openalex.org/I32597200","display_name":"Ghent University","ror":"https://ror.org/00cv9y106","country_code":"BE","type":"education","lineage":["https://openalex.org/I32597200"]}],"countries":["BE"],"is_corresponding":false,"raw_author_name":"Dick Botteldooren","raw_affiliation_strings":["Department of Information Technology, WAVES Research Group, Ghent University, Gent, Belgium"],"affiliations":[{"raw_affiliation_string":"Department of Information Technology, WAVES Research Group, Ghent University, Gent, Belgium","institution_ids":["https://openalex.org/I32597200"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5063248625"],"corresponding_institution_ids":["https://openalex.org/I32597200"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.09772509,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"28","issue":null,"first_page":"2186","last_page":"2200"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11692","display_name":"Noise Effects and Management","score":0.2919999957084656,"subfield":{"id":"https://openalex.org/subfields/3616","display_name":"Speech and Hearing"},"field":{"id":"https://openalex.org/fields/36","display_name":"Health Professions"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},"topics":[{"id":"https://openalex.org/T11692","display_name":"Noise Effects and Management","score":0.2919999957084656,"subfield":{"id":"https://openalex.org/subfields/3616","display_name":"Speech and Hearing"},"field":{"id":"https://openalex.org/fields/36","display_name":"Health Professions"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.24040000140666962,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.16680000722408295,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/soundscape","display_name":"Soundscape","score":0.9639999866485596},{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.666100025177002},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5054000020027161},{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.4884999990463257},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4124999940395355},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.41130000352859497},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.3968999981880188},{"id":"https://openalex.org/keywords/active-listening","display_name":"Active listening","score":0.3781999945640564},{"id":"https://openalex.org/keywords/crowdsourcing","display_name":"Crowdsourcing","score":0.3626999855041504}],"concepts":[{"id":"https://openalex.org/C142795923","wikidata":"https://www.wikidata.org/wiki/Q1358257","display_name":"Soundscape","level":3,"score":0.9639999866485596},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.786899983882904},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.666100025177002},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5054000020027161},{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.4884999990463257},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.48660001158714294},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4578000009059906},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.42340001463890076},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4124999940395355},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.41130000352859497},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3968999981880188},{"id":"https://openalex.org/C177291462","wikidata":"https://www.wikidata.org/wiki/Q423038","display_name":"Active listening","level":2,"score":0.3781999945640564},{"id":"https://openalex.org/C62230096","wikidata":"https://www.wikidata.org/wiki/Q275969","display_name":"Crowdsourcing","level":2,"score":0.3626999855041504},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.3571000099182129},{"id":"https://openalex.org/C186644900","wikidata":"https://www.wikidata.org/wiki/Q194152","display_name":"Parsing","level":2,"score":0.3476000130176544},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.325300008058548},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.29899999499320984},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.289900004863739},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2793999910354614},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.27889999747276306},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.273499995470047},{"id":"https://openalex.org/C2780646970","wikidata":"https://www.wikidata.org/wiki/Q6980787","display_name":"Natural sounds","level":2,"score":0.26660001277923584},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.2660999894142151},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.2648000121116638},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.2597000002861023},{"id":"https://openalex.org/C185874996","wikidata":"https://www.wikidata.org/wiki/Q269699","display_name":"Interdependence","level":2,"score":0.25209999084472656},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.25119999051094055},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2506999969482422}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tmm.2026.3651023","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2026.3651023","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0],"live":[1],"in":[2,84,212,244,256],"a":[3,17,58,132],"rich":[4],"and":[5,30,43,82,106,131,145,195,218,234,260,265,287],"varied":[6],"acoustic":[7,26,99,127],"world,":[8],"which":[9,72],"is":[10,164,181],"experienced":[11],"by":[12,28,96,156,166,274],"individuals":[13],"or":[14],"communities":[15],"as":[16,40,52],"<italic":[18],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[19],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">soundscape</i>.":[20],"Computational":[21],"auditory":[22],"scene":[23],"analysis,":[24,76],"disentangling":[25],"scenes":[27,100],"detecting":[29],"classifying":[31],"events,":[32],"focuses":[33],"on":[34,49,190,215,240],"objective":[35,262],"attributes":[36],"of":[37,125,169,178,186],"sounds,":[38],"such":[39,51],"their":[41,47],"category":[42],"temporal":[44],"characteristics,":[45],"ignoring":[46],"effects":[48],"people,":[50],"the":[53,66,98,107,149,152,158,175,191,196,241,266],"emotions":[54],"they":[55],"evoke":[56],"within":[57],"context.":[59],"To":[60,113],"fill":[61],"this":[62,114],"gap,":[63],"we":[64,116],"propose":[65,117],"affective":[67,110],"soundscape":[68,75,88,95,120,153,188,210,275],"captioning":[69,230,263],"(ASSC)":[70],"task,":[71],"enables":[73],"automated":[74,228],"thus":[77],"avoiding":[78],"labour-intensive":[79],"subjective":[80,258],"ratings":[81],"surveys":[83],"conventional":[85],"methods.":[86],"With":[87],"captioning,":[89],"context-aware":[90],"descriptions":[91],"are":[92,269,290],"generated":[93,267],"for":[94],"capturing":[97],"(ASs),":[101],"audio":[102,229],"events":[103],"(AEs)":[104],"information,":[105],"corresponding":[108],"human":[109,223,257,283],"qualities":[111],"(AQs).":[112],"end,":[115],"an":[118,126],"automatic":[119],"captioner":[121],"(SoundSCaper)":[122],"system":[123],"composed":[124],"model,":[128,278],"i.e.":[129],"SoundAQnet,":[130],"large":[133],"language":[134,247],"model":[135],"(LLM).":[136],"SoundAQnet":[137],"simultaneously":[138],"models":[139],"multi-scale":[140],"information":[141,159],"about":[142],"ASs,":[143],"AEs,":[144],"perceived":[146],"AQs,":[147],"while":[148],"LLM":[150,281],"describes":[151],"with":[154,161,233],"captions":[155,180,268],"parsing":[157],"captured":[160],"SoundAQnet.":[162],"SoundSCaper":[163,208,237,253],"assessed":[165],"two":[167,187],"juries":[168],"32":[170],"people.":[171],"In":[172,205,220],"expert":[173],"evaluation,":[174,207,224],"average":[176],"score":[177],"SoundSCaper-generated":[179],"slightly":[182],"lower":[183],"than":[184],"that":[185],"experts":[189,211],"evaluation":[192,259,288],"set":[193],"D1":[194,217],"external":[197],"mixed":[198],"dataset":[199],"D2,":[200],"but":[201],"not":[202],"statistically":[203],"significant.":[204],"layperson":[206],"outperforms":[209],"several":[213,245],"metrics":[214],"datasets":[216],"D2.":[219],"addition":[221],"to":[222,226,271],"compared":[225],"other":[227],"(AAC)":[231],"systems":[232],"without":[235],"LLM,":[236],"performs":[238,254],"better":[239],"ASSC":[242],"task":[243],"natural":[246],"processing":[248],"(NLP)":[249],"based":[250],"metrics.":[251],"Overall,":[252],"well":[255],"various":[261],"metrics,":[264],"comparable":[270],"those":[272],"annotated":[273],"experts.":[276],"The":[277],"source":[279],"code,":[280],"scripts,":[282],"assessment":[284],"data,":[285],"instructions,":[286],"statistics":[289],"all":[291],"publicly":[292],"available.":[293]},"counts_by_year":[],"updated_date":"2026-03-26T06:05:38.182114","created_date":"2026-01-14T00:00:00"}
