{"id":"https://openalex.org/W4400489963","doi":"https://doi.org/10.1109/cscwd61410.2024.10580534","title":"Language-based Audio Retrieval with GPT-Augmented Captions and Self-Attended Audio Clips","display_name":"Language-based Audio Retrieval with GPT-Augmented Captions and Self-Attended Audio Clips","publication_year":2024,"publication_date":"2024-05-08","ids":{"openalex":"https://openalex.org/W4400489963","doi":"https://doi.org/10.1109/cscwd61410.2024.10580534"},"language":"en","primary_location":{"id":"doi:10.1109/cscwd61410.2024.10580534","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cscwd61410.2024.10580534","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 27th International Conference on Computer Supported Cooperative Work in Design (CSCWD)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101254357","display_name":"Fuyu Gu","orcid":null},"institutions":[{"id":"https://openalex.org/I69356397","display_name":"Xi\u2019an Jiaotong-Liverpool University","ror":"https://ror.org/03zmrmn05","country_code":"CN","type":"education","lineage":["https://openalex.org/I69356397"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Fuyu Gu","raw_affiliation_strings":["Xi&#x2019;an Jiaotong Liverpool University,School of Advanced Technology,Suzhou,China"],"affiliations":[{"raw_affiliation_string":"Xi&#x2019;an Jiaotong Liverpool University,School of Advanced Technology,Suzhou,China","institution_ids":["https://openalex.org/I69356397"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029479817","display_name":"Yang Gu","orcid":"https://orcid.org/0000-0003-2291-8579"},"institutions":[{"id":"https://openalex.org/I69356397","display_name":"Xi\u2019an Jiaotong-Liverpool University","ror":"https://ror.org/03zmrmn05","country_code":"CN","type":"education","lineage":["https://openalex.org/I69356397"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yang Gu","raw_affiliation_strings":["Xi&#x2019;an Jiaotong Liverpool University,School of Advanced Technology,Suzhou,China"],"affiliations":[{"raw_affiliation_string":"Xi&#x2019;an Jiaotong Liverpool University,School of Advanced Technology,Suzhou,China","institution_ids":["https://openalex.org/I69356397"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101395822","display_name":"Yiyan Xu","orcid":null},"institutions":[{"id":"https://openalex.org/I69356397","display_name":"Xi\u2019an Jiaotong-Liverpool University","ror":"https://ror.org/03zmrmn05","country_code":"CN","type":"education","lineage":["https://openalex.org/I69356397"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yiyan Xu","raw_affiliation_strings":["Xi&#x2019;an Jiaotong Liverpool University,School of Advanced Technology,Suzhou,China"],"affiliations":[{"raw_affiliation_string":"Xi&#x2019;an Jiaotong Liverpool University,School of Advanced Technology,Suzhou,China","institution_ids":["https://openalex.org/I69356397"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100527544","display_name":"Sun Hao-ran","orcid":"https://orcid.org/0000-0002-0811-995X"},"institutions":[{"id":"https://openalex.org/I69356397","display_name":"Xi\u2019an Jiaotong-Liverpool University","ror":"https://ror.org/03zmrmn05","country_code":"CN","type":"education","lineage":["https://openalex.org/I69356397"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haoran Sun","raw_affiliation_strings":["Xi&#x2019;an Jiaotong Liverpool University,School of Advanced Technology,Suzhou,China"],"affiliations":[{"raw_affiliation_string":"Xi&#x2019;an Jiaotong Liverpool University,School of Advanced Technology,Suzhou,China","institution_ids":["https://openalex.org/I69356397"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5044105477","display_name":"Yushan Pan","orcid":"https://orcid.org/0000-0002-6877-3937"},"institutions":[{"id":"https://openalex.org/I69356397","display_name":"Xi\u2019an Jiaotong-Liverpool University","ror":"https://ror.org/03zmrmn05","country_code":"CN","type":"education","lineage":["https://openalex.org/I69356397"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yushan Pan","raw_affiliation_strings":["Xi&#x2019;an Jiaotong Liverpool University,School of Advanced Technology,Suzhou,China"],"affiliations":[{"raw_affiliation_string":"Xi&#x2019;an Jiaotong Liverpool University,School of Advanced Technology,Suzhou,China","institution_ids":["https://openalex.org/I69356397"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024084275","display_name":"Shengchen Li","orcid":"https://orcid.org/0000-0002-2488-298X"},"institutions":[{"id":"https://openalex.org/I69356397","display_name":"Xi\u2019an Jiaotong-Liverpool University","ror":"https://ror.org/03zmrmn05","country_code":"CN","type":"education","lineage":["https://openalex.org/I69356397"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shengchen Li","raw_affiliation_strings":["Xi&#x2019;an Jiaotong Liverpool University,School of Advanced Technology,Suzhou,China"],"affiliations":[{"raw_affiliation_string":"Xi&#x2019;an Jiaotong Liverpool University,School of Advanced Technology,Suzhou,China","institution_ids":["https://openalex.org/I69356397"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100392409","display_name":"Haiyang Zhang","orcid":"https://orcid.org/0000-0002-3025-9609"},"institutions":[{"id":"https://openalex.org/I69356397","display_name":"Xi\u2019an Jiaotong-Liverpool University","ror":"https://ror.org/03zmrmn05","country_code":"CN","type":"education","lineage":["https://openalex.org/I69356397"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haiyang Zhang","raw_affiliation_strings":["Xi&#x2019;an Jiaotong Liverpool University,School of Advanced Technology,Suzhou,China"],"affiliations":[{"raw_affiliation_string":"Xi&#x2019;an Jiaotong Liverpool University,School of Advanced Technology,Suzhou,China","institution_ids":["https://openalex.org/I69356397"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5101254357"],"corresponding_institution_ids":["https://openalex.org/I69356397"],"apc_list":null,"apc_paid":null,"fwci":1.4942,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.82079563,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"858","last_page":"863"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.996999979019165,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.996999979019165,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13996","display_name":"Diverse Musicological Studies","score":0.9498000144958496,"subfield":{"id":"https://openalex.org/subfields/1210","display_name":"Music"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9305999875068665,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/clips","display_name":"CLIPS","score":0.8681225776672363},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7883548736572266},{"id":"https://openalex.org/keywords/audio-analyzer","display_name":"Audio analyzer","score":0.5501660704612732},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5317189693450928},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.47086086869239807},{"id":"https://openalex.org/keywords/audio-signal-processing","display_name":"Audio signal processing","score":0.36257416009902954},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3206779360771179},{"id":"https://openalex.org/keywords/audio-signal","display_name":"Audio signal","score":0.29196643829345703},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.2675144672393799}],"concepts":[{"id":"https://openalex.org/C2778739407","wikidata":"https://www.wikidata.org/wiki/Q165372","display_name":"CLIPS","level":2,"score":0.8681225776672363},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7883548736572266},{"id":"https://openalex.org/C160372630","wikidata":"https://www.wikidata.org/wiki/Q4819855","display_name":"Audio analyzer","level":5,"score":0.5501660704612732},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5317189693450928},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.47086086869239807},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.36257416009902954},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3206779360771179},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.29196643829345703},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.2675144672393799}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/cscwd61410.2024.10580534","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cscwd61410.2024.10580534","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 27th International Conference on Computer Supported Cooperative Work in Design (CSCWD)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","score":0.7300000190734863,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2098934641","https://openalex.org/W3135871143","https://openalex.org/W4382560817","https://openalex.org/W2347715673","https://openalex.org/W2405875744","https://openalex.org/W2020966186","https://openalex.org/W296599067","https://openalex.org/W4387698063","https://openalex.org/W27741130","https://openalex.org/W2494533082"],"abstract_inverted_index":{"With":[0],"the":[1,52,129,143,153,158],"explosion":[2],"of":[3,44],"user-generated":[4],"content":[5,17],"in":[6,137,150],"recent":[7],"years,":[8],"efficient":[9],"methods":[10],"for":[11,48,71],"organizing":[12],"multimedia":[13],"databases":[14],"based":[15,34],"on":[16,35,105,142,152],"and":[18,100,112,146],"retrieving":[19],"relevant":[20,31],"items":[21],"have":[22],"become":[23],"essential.":[24],"Language-based":[25],"audio":[26,32,73,84,92],"retrieval":[27,61,74],"seeks":[28],"to":[29,59,82,95],"find":[30],"clips":[33],"natural":[36],"language":[37,53,88],"queries.":[38],"However,":[39],"there":[40],"exists":[41],"a":[42,68,134,147],"scarcity":[43],"datasets":[45],"specifically":[46],"developed":[47],"this":[49,64],"task.":[50],"Moreover,":[51],"annotations":[54],"often":[55],"carry":[56],"biases,":[57],"leading":[58],"unsatisfactory":[60],"accuracy.":[62],"In":[63],"work,":[65],"we":[66],"propose":[67],"novel":[69],"framework":[70,118,131],"language-based":[72],"that":[75,116],"aims":[76],"to:":[77],"1)":[78],"utilize":[79],"GPT-generated":[80],"text":[81],"augment":[83],"captions,":[85],"thereby":[86],"improving":[87],"diversity;":[89],"2)":[90],"employ":[91],"self-attention":[93],"mechanisms":[94],"capture":[96],"intricate":[97],"acoustic":[98],"features":[99],"temporal":[101],"dependencies.":[102],"Experiments":[103],"conducted":[104],"two":[106],"public":[107],"datasets,":[108],"containing":[109],"both":[110],"short-":[111],"long-term":[113],"audios,":[114],"demonstrate":[115],"our":[117],"can":[119,132],"achieve":[120,133],"significant":[121],"performance":[122],"improvements":[123],"compared":[124,156],"with":[125,157],"other":[126],"methods.":[127],"Specifically,":[128],"proposed":[130],"27%":[135],"increase":[136],"mean":[138],"average":[139],"precision":[140],"(mAP)":[141],"Clotho":[144],"dataset,":[145],"31%":[148],"improvement":[149],"mAP":[151],"AudioCaps":[154],"dataset":[155],"baseline.":[159]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":1}],"updated_date":"2025-12-26T23:08:49.675405","created_date":"2025-10-10T00:00:00"}
