{"id":"https://openalex.org/W3163843406","doi":"https://doi.org/10.1109/icassp39728.2021.9414834","title":"Text-to-Audio Grounding: Building Correspondence Between Captions and Sound Events","display_name":"Text-to-Audio Grounding: Building Correspondence Between Captions and Sound Events","publication_year":2021,"publication_date":"2021-05-13","ids":{"openalex":"https://openalex.org/W3163843406","doi":"https://doi.org/10.1109/icassp39728.2021.9414834","mag":"3163843406"},"language":"en","primary_location":{"id":"doi:10.1109/icassp39728.2021.9414834","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp39728.2021.9414834","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5025827045","display_name":"Xuenan Xu","orcid":"https://orcid.org/0000-0001-8718-1278"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Xuenan Xu","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence SpeechLab,Department of Computer Science and Engineering AI Institute,Shanghai,China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence SpeechLab,Department of Computer Science and Engineering AI Institute,Shanghai,China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080154283","display_name":"Heinrich Dinkel","orcid":"https://orcid.org/0000-0003-4330-8980"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Heinrich Dinkel","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence SpeechLab,Department of Computer Science and Engineering AI Institute,Shanghai,China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence SpeechLab,Department of Computer Science and Engineering AI Institute,Shanghai,China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081865665","display_name":"Mengyue Wu","orcid":"https://orcid.org/0000-0002-5599-8707"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mengyue Wu","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence SpeechLab,Department of Computer Science and Engineering AI Institute,Shanghai,China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence SpeechLab,Department of Computer Science and Engineering AI Institute,Shanghai,China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5043098653","display_name":"Kai Yu","orcid":"https://orcid.org/0000-0002-7102-9826"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kai Yu","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence SpeechLab,Department of Computer Science and Engineering AI Institute,Shanghai,China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence SpeechLab,Department of Computer Science and Engineering AI Institute,Shanghai,China","institution_ids":["https://openalex.org/I183067930"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5025827045"],"corresponding_institution_ids":["https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":2.4689,"has_fulltext":false,"cited_by_count":21,"citation_normalized_percentile":{"value":0.8984422,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"606","last_page":"610"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9948999881744385,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.989799976348877,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7382395267486572},{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.6524832248687744},{"id":"https://openalex.org/keywords/event","display_name":"Event (particle physics)","score":0.5922408103942871},{"id":"https://openalex.org/keywords/timestamp","display_name":"Timestamp","score":0.5878236293792725},{"id":"https://openalex.org/keywords/polyphony","display_name":"Polyphony","score":0.575247585773468},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5714160799980164},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5511674880981445},{"id":"https://openalex.org/keywords/audio-signal-processing","display_name":"Audio signal processing","score":0.5441019535064697},{"id":"https://openalex.org/keywords/sound","display_name":"Sound (geography)","score":0.5355489253997803},{"id":"https://openalex.org/keywords/audio-analyzer","display_name":"Audio analyzer","score":0.4702908992767334},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.45087406039237976},{"id":"https://openalex.org/keywords/audio-signal","display_name":"Audio signal","score":0.4413268566131592},{"id":"https://openalex.org/keywords/sound-recording-and-reproduction","display_name":"Sound recording and reproduction","score":0.4185277223587036},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.311021089553833},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.10477325320243835},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.10455435514450073},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.10260313749313354},{"id":"https://openalex.org/keywords/real-time-computing","display_name":"Real-time computing","score":0.06643226742744446}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7382395267486572},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.6524832248687744},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.5922408103942871},{"id":"https://openalex.org/C113954288","wikidata":"https://www.wikidata.org/wiki/Q186885","display_name":"Timestamp","level":2,"score":0.5878236293792725},{"id":"https://openalex.org/C128979739","wikidata":"https://www.wikidata.org/wiki/Q179465","display_name":"Polyphony","level":2,"score":0.575247585773468},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5714160799980164},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5511674880981445},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.5441019535064697},{"id":"https://openalex.org/C203718221","wikidata":"https://www.wikidata.org/wiki/Q491713","display_name":"Sound (geography)","level":2,"score":0.5355489253997803},{"id":"https://openalex.org/C160372630","wikidata":"https://www.wikidata.org/wiki/Q4819855","display_name":"Audio analyzer","level":5,"score":0.4702908992767334},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.45087406039237976},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.4413268566131592},{"id":"https://openalex.org/C128422554","wikidata":"https://www.wikidata.org/wiki/Q20077126","display_name":"Sound recording and reproduction","level":2,"score":0.4185277223587036},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.311021089553833},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.10477325320243835},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.10455435514450073},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.10260313749313354},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.06643226742744446},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0},{"id":"https://openalex.org/C201995342","wikidata":"https://www.wikidata.org/wiki/Q682496","display_name":"Systems engineering","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp39728.2021.9414834","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp39728.2021.9414834","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.8199999928474426}],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W1773149199","https://openalex.org/W2143017621","https://openalex.org/W2247513039","https://openalex.org/W2408239454","https://openalex.org/W2593116425","https://openalex.org/W2803088946","https://openalex.org/W2884011836","https://openalex.org/W2916103538","https://openalex.org/W2940092410","https://openalex.org/W2945761034","https://openalex.org/W2962870068","https://openalex.org/W2963080533","https://openalex.org/W2963614783","https://openalex.org/W2964213897","https://openalex.org/W2968101724","https://openalex.org/W2999541181","https://openalex.org/W3015190346","https://openalex.org/W3038899388","https://openalex.org/W3098232790","https://openalex.org/W3124216180","https://openalex.org/W3125993195","https://openalex.org/W4289329167","https://openalex.org/W6751762309","https://openalex.org/W6753516609","https://openalex.org/W6769050543","https://openalex.org/W6780379688"],"related_works":["https://openalex.org/W2098934641","https://openalex.org/W1966390704","https://openalex.org/W4390482300","https://openalex.org/W1590604789","https://openalex.org/W4243477106","https://openalex.org/W2511110724","https://openalex.org/W2494533082","https://openalex.org/W3135871143","https://openalex.org/W4382560817","https://openalex.org/W2197119771"],"abstract_inverted_index":{"Automated":[0],"Audio":[1],"Captioning":[2],"is":[3,94],"a":[4,104],"cross-modal":[5],"task,":[6,79],"generating":[7],"natural":[8],"language":[9,89],"descriptions":[10],"to":[11],"summarize":[12],"the":[13,20,25,49,55,62,75,83],"audio":[14,27,86],"clips\u2019":[15],"sound":[16,22,52,68],"events.":[17],"However,":[18],"grounding":[19,77],"actual":[21],"events":[23,53],"in":[24,58,97],"given":[26],"based":[28],"on":[29,71],"its":[30],"corresponding":[31],"caption":[32],"has":[33],"not":[34],"been":[35],"investigated.":[36],"This":[37],"paper":[38],"contributes":[39],"an":[40,98],"Audio-Grounding":[41],"dataset":[42],"<sup":[43],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[44],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>":[45],",":[46],"which":[47,80],"provides":[48],"correspondence":[50],"be-tween":[51,85],"and":[54,88,103],"captions":[56],"provided":[57],"Audiocaps,":[59],"along":[60],"with":[61],"location":[63],"(timestamps)":[64],"of":[65,101,111],"each":[66],"present":[67],"event.":[69],"Based":[70],"such,":[72],"we":[73],"propose":[74],"text-to-audio":[76],"(TAG)":[78],"interactively":[81],"considers":[82],"relationship":[84],"processing":[87],"understanding.":[90],"A":[91],"base-line":[92],"approach":[93],"provided,":[95],"resulting":[96],"event-F1":[99],"score":[100,110],"28.3%":[102],"Polyphonic":[105],"Sound":[106],"Detection":[107],"Score":[108],"(PSDS)":[109],"14.7%.":[112]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":6},{"year":2023,"cited_by_count":5},{"year":2022,"cited_by_count":4},{"year":2021,"cited_by_count":1}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2025-10-10T00:00:00"}
