{"id":"https://openalex.org/W4401809402","doi":"https://doi.org/10.1109/tmm.2024.3443614","title":"Towards Weakly Supervised Text-to-Audio Grounding","display_name":"Towards Weakly Supervised Text-to-Audio Grounding","publication_year":2024,"publication_date":"2024-01-01","ids":{"openalex":"https://openalex.org/W4401809402","doi":"https://doi.org/10.1109/tmm.2024.3443614"},"language":"en","primary_location":{"id":"doi:10.1109/tmm.2024.3443614","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2024.3443614","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5025827045","display_name":"Xuenan Xu","orcid":"https://orcid.org/0000-0001-8718-1278"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Xuenan Xu","raw_affiliation_strings":["Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0001-8718-1278","affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100438492","display_name":"Ziyang Ma","orcid":"https://orcid.org/0000-0002-0623-9114"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ziyang Ma","raw_affiliation_strings":["Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081865665","display_name":"Mengyue Wu","orcid":"https://orcid.org/0000-0002-5599-8707"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mengyue Wu","raw_affiliation_strings":["Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0002-5599-8707","affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5043098653","display_name":"Kai Yu","orcid":"https://orcid.org/0000-0002-7102-9826"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kai Yu","raw_affiliation_strings":["Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0002-7102-9826","affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5025827045"],"corresponding_institution_ids":["https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":2.3179,"has_fulltext":false,"cited_by_count":7,"citation_normalized_percentile":{"value":0.89896436,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":"26","issue":null,"first_page":"11126","last_page":"11138"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9854000210762024,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9854000210762024,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9711999893188477,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9419000148773193,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8147042989730835},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4582255780696869},{"id":"https://openalex.org/keywords/audio-signal-processing","display_name":"Audio signal processing","score":0.4536237120628357},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4110090732574463},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3252565860748291},{"id":"https://openalex.org/keywords/audio-signal","display_name":"Audio signal","score":0.2697618007659912},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.1547386348247528}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8147042989730835},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4582255780696869},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.4536237120628357},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4110090732574463},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3252565860748291},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.2697618007659912},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.1547386348247528}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tmm.2024.3443614","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2024.3443614","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G7857932966","display_name":null,"funder_award_id":"92048205","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":64,"referenced_works":["https://openalex.org/W1773149199","https://openalex.org/W1905882502","https://openalex.org/W2112912048","https://openalex.org/W2247513039","https://openalex.org/W2593116425","https://openalex.org/W2891456603","https://openalex.org/W2896457183","https://openalex.org/W2962703144","https://openalex.org/W2963036257","https://openalex.org/W2963610932","https://openalex.org/W2964089981","https://openalex.org/W2968101724","https://openalex.org/W2970641574","https://openalex.org/W2979933490","https://openalex.org/W2987401211","https://openalex.org/W2989176720","https://openalex.org/W3004910802","https://openalex.org/W3006275583","https://openalex.org/W3015190346","https://openalex.org/W3015591594","https://openalex.org/W3023138874","https://openalex.org/W3026041220","https://openalex.org/W3035160371","https://openalex.org/W3094550259","https://openalex.org/W3098232790","https://openalex.org/W3113075965","https://openalex.org/W3117585461","https://openalex.org/W3124216180","https://openalex.org/W3125993195","https://openalex.org/W3162999565","https://openalex.org/W3163843406","https://openalex.org/W3177365819","https://openalex.org/W3185341429","https://openalex.org/W3186567887","https://openalex.org/W3204267711","https://openalex.org/W3205743929","https://openalex.org/W3205945847","https://openalex.org/W3209059054","https://openalex.org/W4224920041","https://openalex.org/W4226442948","https://openalex.org/W4313156423","https://openalex.org/W4367319981","https://openalex.org/W4372260310","https://openalex.org/W4372260505","https://openalex.org/W4375869156","https://openalex.org/W4385227173","https://openalex.org/W4385488967","https://openalex.org/W4385822682","https://openalex.org/W4385823092","https://openalex.org/W4386065620","https://openalex.org/W4386076626","https://openalex.org/W4386609057","https://openalex.org/W4389513749","https://openalex.org/W4390812911","https://openalex.org/W4392251904","https://openalex.org/W4392903801","https://openalex.org/W4400033239","https://openalex.org/W6676647902","https://openalex.org/W6751762309","https://openalex.org/W6769196770","https://openalex.org/W6780218876","https://openalex.org/W6796761347","https://openalex.org/W6797746672","https://openalex.org/W6810007534"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052","https://openalex.org/W2382290278","https://openalex.org/W3204019825"],"abstract_inverted_index":{"Text-to-audio":[0],"grounding":[1,34],"(TAG)":[2],"task":[3,19],"aims":[4],"to":[5,61,67,109,128,157],"predict":[6],"the":[7,46,85,92,98,130,133,159],"onsets":[8],"and":[9,44,80,96,116,124,136],"offsets":[10],"of":[11,39,48,87,100,132,161],"sound":[12,40],"events":[13,41],"described":[14],"by":[15],"natural":[16],"language.":[17],"This":[18,28],"can":[20,53],"facilitate":[21],"applications":[22],"such":[23],"as":[24],"multimodal":[25],"information":[26],"retrieval.":[27],"paper":[29],"focuses":[30],"on":[31,164],"weakly-supervised":[32],"text-to-audio":[33],"(WSTAG),":[35],"where":[36],"frame-level":[37],"annotations":[38],"are":[42,74,126],"unavailable,":[43],"only":[45],"caption":[47],"a":[49],"whole":[50],"audio":[51,114],"clip":[52],"be":[54],"utilized":[55],"for":[56,118],"training.":[57,119],"WSTAG":[58,72,94,108,150],"is":[59],"superior":[60],"strongly-supervised":[62],"approaches":[63],"in":[64,76,91],"its":[65],"scalability":[66],"large":[68],"audio-text":[69],"datasets.":[70],"Two":[71],"frameworks":[73],"studied":[75],"this":[77],"paper:":[78],"sentence-level":[79],"phrase-level.":[81],"First,":[82],"we":[83,153],"analyze":[84,158],"limitations":[86],"mean":[88],"pooling":[89,102],"used":[90],"previous":[93,149],"approach":[95],"investigate":[97],"effects":[99,160],"different":[101],"strategies.":[103],"We":[104],"then":[105],"propose":[106],"phrase-level":[107,165],"use":[110],"matching":[111],"labels":[112,135],"between":[113],"clips":[115],"phrases":[117],"Advanced":[120],"negative":[121],"sampling":[122],"strategies":[123],"self-supervision":[125],"proposed":[127],"enhance":[129],"accuracy":[131],"weak":[134],"provide":[137],"pseudo":[138],"strong":[139],"labels.":[140],"Experimental":[141],"results":[142],"show":[143],"that":[144],"our":[145],"system":[146],"significantly":[147],"outperforms":[148],"methods.":[151],"Finally,":[152],"conduct":[154],"extensive":[155],"experiments":[156],"several":[162],"factors":[163],"WSTAG.":[166]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":6}],"updated_date":"2025-12-23T23:11:35.936235","created_date":"2025-10-10T00:00:00"}
