{"id":"https://openalex.org/W3155372489","doi":"https://doi.org/10.21437/interspeech.2021-2135","title":"Cross-Modal Learning for Audio-Visual Video Parsing","display_name":"Cross-Modal Learning for Audio-Visual Video Parsing","publication_year":2021,"publication_date":"2021-08-27","ids":{"openalex":"https://openalex.org/W3155372489","doi":"https://doi.org/10.21437/interspeech.2021-2135","mag":"3155372489"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2021-2135","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2021-2135","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2021","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2104.04598","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5010840859","display_name":"Jatin Lamba","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Jatin Lamba","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"- Abhishek","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"- Abhishek","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064447033","display_name":"Jayaprakash Akula","orcid":"https://orcid.org/0000-0002-1612-1064"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jayaprakash Akula","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089712643","display_name":"Rishabh Dabral","orcid":"https://orcid.org/0009-0004-1245-4146"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rishabh Dabral","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036738038","display_name":"Preethi Jyothi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Preethi Jyothi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5089606464","display_name":"Ganesh Ramakrishnan","orcid":"https://orcid.org/0000-0003-4533-2490"},"institutions":[{"id":"https://openalex.org/I162827531","display_name":"Indian Institute of Technology Bombay","ror":"https://ror.org/02qyf5152","country_code":"IN","type":"education","lineage":["https://openalex.org/I162827531"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Ganesh Ramakrishnan","raw_affiliation_strings":["Indian Institute of Technology Bombay"],"affiliations":[{"raw_affiliation_string":"Indian Institute of Technology Bombay","institution_ids":["https://openalex.org/I162827531"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5010840859"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.03543018,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1937","last_page":"1941"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9957000017166138,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.804511547088623},{"id":"https://openalex.org/keywords/parsing","display_name":"Parsing","score":0.7720412611961365},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.6631086468696594},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.6526001691818237},{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.5916232466697693},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5655993819236755},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5520999431610107},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5406460762023926},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5227192044258118},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.5177892446517944},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.3964683413505554},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.325761616230011},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.19968006014823914}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.804511547088623},{"id":"https://openalex.org/C186644900","wikidata":"https://www.wikidata.org/wiki/Q194152","display_name":"Parsing","level":2,"score":0.7720412611961365},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.6631086468696594},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.6526001691818237},{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.5916232466697693},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5655993819236755},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5520999431610107},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5406460762023926},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5227192044258118},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.5177892446517944},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3964683413505554},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.325761616230011},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.19968006014823914},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.21437/interspeech.2021-2135","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2021-2135","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2021","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2104.04598","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2104.04598","pdf_url":"https://arxiv.org/pdf/2104.04598","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"mag:3155372489","is_oa":true,"landing_page_url":"https://arxiv.org/pdf/2104.04598.pdf","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"arXiv (Cornell University)","raw_type":null},{"id":"doi:10.48550/arxiv.2104.04598","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2104.04598","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2104.04598","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2104.04598","pdf_url":"https://arxiv.org/pdf/2104.04598","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320323448","display_name":"Indian Institute of Technology Bombay","ror":"https://ror.org/02qyf5152"}],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W3155372489.pdf"},"referenced_works_count":16,"referenced_works":["https://openalex.org/W2081408912","https://openalex.org/W2168272426","https://openalex.org/W2194775991","https://openalex.org/W2417429787","https://openalex.org/W2593116425","https://openalex.org/W2962960500","https://openalex.org/W2964048159","https://openalex.org/W2964109005","https://openalex.org/W2997909293","https://openalex.org/W3020933450","https://openalex.org/W3021108376","https://openalex.org/W3090449556","https://openalex.org/W3093145643","https://openalex.org/W3118120400","https://openalex.org/W3123318516","https://openalex.org/W3175335326"],"related_works":["https://openalex.org/W3118120400","https://openalex.org/W2968808972","https://openalex.org/W2908965201","https://openalex.org/W3175335326","https://openalex.org/W2950864153","https://openalex.org/W3118548710","https://openalex.org/W3197980617","https://openalex.org/W3159920639","https://openalex.org/W2770520232","https://openalex.org/W3034742263","https://openalex.org/W2785892019","https://openalex.org/W2985144848","https://openalex.org/W3161349773","https://openalex.org/W3048065599","https://openalex.org/W3207922251","https://openalex.org/W2963715927","https://openalex.org/W3110606395","https://openalex.org/W3197115220","https://openalex.org/W3081040707","https://openalex.org/W3205716523"],"abstract_inverted_index":{"In":[0],"this":[1],"paper,":[2],"we":[3,102],"present":[4,87,119],"a":[5,19],"novel":[6],"approach":[7,30],"to":[8,81,122],"the":[9,33,53,92,104,124],"audio-visual":[10],"video":[11,20],"parsing":[12,29],"(AVVP)":[13],"task":[14],"that":[15,101],"demarcates":[16],"events":[17],"from":[18,52],"separately":[21],"for":[22,115],"audio":[23],"and":[24,40,64,95,99,130],"visual":[25],"modalities.":[26],"The":[27],"proposed":[28,114],"simultaneously":[31],"detects":[32],"temporal":[34],"boundaries":[35],"in":[36],"terms":[37],"of":[38,43,126],"start":[39],"end":[41],"times":[42],"such":[44],"events.":[45],"We":[46,86,117],"show":[47,100],"how":[48],"AVVP":[49],"can":[50],"benefit":[51],"following":[54],"techniques":[55],"geared":[56],"towards":[57],"effective":[58],"cross-modal":[59,83],"learning:":[60],"(i)":[61],"adversarial":[62,131],"training":[63],"skip":[65],"connections":[66],"(ii)":[67],"global":[68,128],"context":[69],"aware":[70],"attention":[71,129],"and,":[72],"(iii)":[73],"self-supervised":[74],"pretraining":[75],"using":[76],"an":[77],"audio-video":[78,84],"grounding":[79],"objective":[80],"obtain":[82],"representations.":[85],"extensive":[88],"experimental":[89],"evaluations":[90],"on":[91,110],"Look,":[93],"Listen,":[94],"Parse":[96],"(LLP)":[97],"dataset":[98],"outperform":[103],"state-of-the-art":[105],"Hybrid":[106],"Attention":[107],"Network":[108],"(HAN)":[109],"all":[111],"five":[112],"metrics":[113],"AVVP.":[116],"also":[118],"several":[120],"ablations":[121],"validate":[123],"effect":[125],"pretraining,":[127],"training.":[132]},"counts_by_year":[],"updated_date":"2026-04-16T08:26:57.006410","created_date":"2025-10-10T00:00:00"}
