{"id":"https://openalex.org/W4385488967","doi":"https://doi.org/10.1109/icasspw59220.2023.10192960","title":"Investigating Pooling Strategies and Loss Functions for Weakly-Supervised Text-to-Audio Grounding via Contrastive Learning","display_name":"Investigating Pooling Strategies and Loss Functions for Weakly-Supervised Text-to-Audio Grounding via Contrastive Learning","publication_year":2023,"publication_date":"2023-06-04","ids":{"openalex":"https://openalex.org/W4385488967","doi":"https://doi.org/10.1109/icasspw59220.2023.10192960"},"language":"en","primary_location":{"id":"doi:10.1109/icasspw59220.2023.10192960","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icasspw59220.2023.10192960","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5025827045","display_name":"Xuenan Xu","orcid":"https://orcid.org/0000-0001-8718-1278"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Xuenan Xu","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence X-LANCE Lab,Department of Computer Science and Engineering AI Institute,Shanghai,China","Department of Computer Science and Engineering AI Institute, MoE Key Lab of Artificial Intelligence X-LANCE Lab, Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence X-LANCE Lab,Department of Computer Science and Engineering AI Institute,Shanghai,China","institution_ids":["https://openalex.org/I183067930"]},{"raw_affiliation_string":"Department of Computer Science and Engineering AI Institute, MoE Key Lab of Artificial Intelligence X-LANCE Lab, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081865665","display_name":"Mengyue Wu","orcid":"https://orcid.org/0000-0002-5599-8707"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mengyue Wu","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence X-LANCE Lab,Department of Computer Science and Engineering AI Institute,Shanghai,China","Department of Computer Science and Engineering AI Institute, MoE Key Lab of Artificial Intelligence X-LANCE Lab, Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence X-LANCE Lab,Department of Computer Science and Engineering AI Institute,Shanghai,China","institution_ids":["https://openalex.org/I183067930"]},{"raw_affiliation_string":"Department of Computer Science and Engineering AI Institute, MoE Key Lab of Artificial Intelligence X-LANCE Lab, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5043098653","display_name":"Kai Yu","orcid":"https://orcid.org/0000-0002-7102-9826"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kai Yu","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence X-LANCE Lab,Department of Computer Science and Engineering AI Institute,Shanghai,China","Department of Computer Science and Engineering AI Institute, MoE Key Lab of Artificial Intelligence X-LANCE Lab, Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence X-LANCE Lab,Department of Computer Science and Engineering AI Institute,Shanghai,China","institution_ids":["https://openalex.org/I183067930"]},{"raw_affiliation_string":"Department of Computer Science and Engineering AI Institute, MoE Key Lab of Artificial Intelligence X-LANCE Lab, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5025827045"],"corresponding_institution_ids":["https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":1.0166,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.75821493,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9936000108718872,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pooling","display_name":"Pooling","score":0.8693329691886902},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.765978217124939},{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.733320951461792},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5143231153488159},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.49404388666152954},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.49054235219955444},{"id":"https://openalex.org/keywords/function","display_name":"Function (biology)","score":0.4727635979652405},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.41268154978752136},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.36414092779159546},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.3519476652145386},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.32896867394447327},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.08133041858673096},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.07474473118782043}],"concepts":[{"id":"https://openalex.org/C70437156","wikidata":"https://www.wikidata.org/wiki/Q7228652","display_name":"Pooling","level":2,"score":0.8693329691886902},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.765978217124939},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.733320951461792},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5143231153488159},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.49404388666152954},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.49054235219955444},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.4727635979652405},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.41268154978752136},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.36414092779159546},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3519476652145386},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.32896867394447327},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.08133041858673096},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.07474473118782043},{"id":"https://openalex.org/C78458016","wikidata":"https://www.wikidata.org/wiki/Q840400","display_name":"Evolutionary biology","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icasspw59220.2023.10192960","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icasspw59220.2023.10192960","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.7400000095367432,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":27,"referenced_works":["https://openalex.org/W1773149199","https://openalex.org/W1905882502","https://openalex.org/W2135342008","https://openalex.org/W2591013610","https://openalex.org/W2593116425","https://openalex.org/W2774267535","https://openalex.org/W2803088946","https://openalex.org/W2963610932","https://openalex.org/W2965809241","https://openalex.org/W2997525715","https://openalex.org/W3005680577","https://openalex.org/W3006275583","https://openalex.org/W3015190346","https://openalex.org/W3016059657","https://openalex.org/W3035212740","https://openalex.org/W3162999565","https://openalex.org/W3163843406","https://openalex.org/W3177365819","https://openalex.org/W3204267711","https://openalex.org/W4221157007","https://openalex.org/W4224920041","https://openalex.org/W4298364821","https://openalex.org/W6747225742","https://openalex.org/W6751762309","https://openalex.org/W6774314701","https://openalex.org/W6797746672","https://openalex.org/W6801330939"],"related_works":["https://openalex.org/W4210416330","https://openalex.org/W3088136942","https://openalex.org/W2949362007","https://openalex.org/W2775506363","https://openalex.org/W4290852288","https://openalex.org/W4388893791","https://openalex.org/W4283207562","https://openalex.org/W2963177403","https://openalex.org/W2330246314","https://openalex.org/W3009270862"],"abstract_inverted_index":{"Text-to-audio":[0],"grounding":[1],"(TAG)":[2],"aims":[3],"to":[4,40,88,107,141,173],"detect":[5,174],"sound":[6,74],"events":[7],"described":[8],"by":[9],"natural":[10],"language":[11],"in":[12,54,69],"an":[13],"audio":[14,44],"clip.":[15],"Strongly-supervised":[16],"TAG":[17,36,42,105,112],"requires":[18],"extensive":[19],"human":[20],"annotations":[21],"of":[22,92,123],"the":[23,30,104,121,131,136,142,146,171],"events\u2019":[24],"on-":[25],"and":[26,59,78,158,189],"off-sets.":[27],"To":[28],"mitigate":[29],"reliance":[31],"on":[32,43,48,126,130],"strongly-annotated":[33],"data,":[34],"weakly-supervised":[35],"(WSTAG)":[37],"is":[38,139],"proposed":[39],"train":[41],"captioning":[45],"data":[46,190],"based":[47],"contrastive":[49],"learning.":[50],"However,":[51],"crucial":[52,140],"components":[53,125],"WSTAG,":[55],"namely":[56],"pooling":[57,137,156],"strategies":[58,157],"loss":[60,147,159],"functions,":[61,160],"remain":[62],"unexplored.":[63],"Directly":[64],"bringing":[65],"their":[66],"corresponding":[67],"ones":[68,179],"closely-related":[70],"tasks,":[71],"such":[72],"as":[73],"event":[75],"detection":[76],"(SED)":[77],"audio-text":[79],"retrieval,":[80],"do":[81],"not":[82],"necessarily":[83],"fit":[84],"this":[85,99],"task":[86],"due":[87],"TAG\u2019s":[89],"unique":[90],"requirement":[91],"fine-grained":[93],"alignment":[94],"via":[95],"free":[96],"text.":[97],"In":[98],"work,":[100],"we":[101,118,161],"first":[102],"improve":[103],"dataset":[106,133],"obtain":[108],"a":[109,163],"more":[110,164],"reliable":[111],"performance":[113,144],"indicator,":[114],"AudioGrounding":[115],"v2.":[116],"Then":[117],"extensively":[119],"investigate":[120],"effects":[122],"these":[124],"WSTAG.":[127],"The":[128,187],"result":[129],"refined":[132],"demonstrates":[134],"that":[135,168],"strategy":[138],"model":[143],"while":[145],"function":[148],"presents":[149],"much":[150],"less":[151],"influence.":[152],"By":[153],"combining":[154],"proper":[155],"explore":[162],"effective":[165],"WSTAG":[166],"framework":[167],"significantly":[169],"enhances":[170],"ability":[172],"events,":[175],"especially":[176],"for":[177],"short-duration":[178],"<sup":[180,184],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[181,185],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>":[182,186],".":[183],"code":[188],"are":[191],"available":[192],"athttps://github.com/wsntxxn/TextToAudioGrounding":[193]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":4}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
