{"id":"https://openalex.org/W4308222513","doi":"https://doi.org/10.1145/3536221.3556625","title":"Does Audio help in deep Audio-Visual Saliency prediction models?","display_name":"Does Audio help in deep Audio-Visual Saliency prediction models?","publication_year":2022,"publication_date":"2022-11-04","ids":{"openalex":"https://openalex.org/W4308222513","doi":"https://doi.org/10.1145/3536221.3556625"},"language":"en","primary_location":{"id":"doi:10.1145/3536221.3556625","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3536221.3556625","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2022 International Conference on Multimodal Interaction","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100902210","display_name":"Ritvik Agrawal","orcid":null},"institutions":[{"id":"https://openalex.org/I64189192","display_name":"International Institute of Information Technology, Hyderabad","ror":"https://ror.org/05f11g639","country_code":"IN","type":"education","lineage":["https://openalex.org/I64189192"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Ritvik Agrawal","raw_affiliation_strings":["CVIT, KCIS, International Institute for Information Technology, Hyderabad, India"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"CVIT, KCIS, International Institute for Information Technology, Hyderabad, India","institution_ids":["https://openalex.org/I64189192"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039292825","display_name":"Shreyank Jyoti","orcid":null},"institutions":[{"id":"https://openalex.org/I64189192","display_name":"International Institute of Information Technology, Hyderabad","ror":"https://ror.org/05f11g639","country_code":"IN","type":"education","lineage":["https://openalex.org/I64189192"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Shreyank Jyoti","raw_affiliation_strings":["CVIT, KCIS, International Institute for Information Technology, Hyderabad, India"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"CVIT, KCIS, International Institute for Information Technology, Hyderabad, India","institution_ids":["https://openalex.org/I64189192"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070477682","display_name":"Rohit Girmaji","orcid":"https://orcid.org/0009-0000-3361-5894"},"institutions":[{"id":"https://openalex.org/I64189192","display_name":"International Institute of Information Technology, Hyderabad","ror":"https://ror.org/05f11g639","country_code":"IN","type":"education","lineage":["https://openalex.org/I64189192"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Rohit Girmaji","raw_affiliation_strings":["CVIT, KCIS, International Institute for Information Technology, Hyderabad, India"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"CVIT, KCIS, International Institute for Information Technology, Hyderabad, India","institution_ids":["https://openalex.org/I64189192"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082060797","display_name":"Sarath Sivaprasad","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sarath Sivaprasad","raw_affiliation_strings":["CVIT, KCIS, International Institute for Information Technology, Hyderabad, India and TCS Research, Pune, India"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"CVIT, KCIS, International Institute for Information Technology, Hyderabad, India and TCS Research, Pune, India","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5067298540","display_name":"Vineet Gandhi","orcid":"https://orcid.org/0000-0001-8861-7731"},"institutions":[{"id":"https://openalex.org/I64189192","display_name":"International Institute of Information Technology, Hyderabad","ror":"https://ror.org/05f11g639","country_code":"IN","type":"education","lineage":["https://openalex.org/I64189192"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Vineet Gandhi","raw_affiliation_strings":["CVIT, KCIS, International Institute for Information Technology, Hyderabad, India"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"CVIT, KCIS, International Institute for Information Technology, Hyderabad, India","institution_ids":["https://openalex.org/I64189192"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.2973,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.52121717,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"48","last_page":"56"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12032","display_name":"Multisensory perception and integration","score":0.9939000010490417,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9857000112533569,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.8804715871810913},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8198353052139282},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.7748548984527588},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6089147925376892},{"id":"https://openalex.org/keywords/relevance","display_name":"Relevance (law)","score":0.5049014687538147},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5024187564849854},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.490054726600647},{"id":"https://openalex.org/keywords/audio-analyzer","display_name":"Audio analyzer","score":0.4559413194656372},{"id":"https://openalex.org/keywords/audio-signal-processing","display_name":"Audio signal processing","score":0.4391140043735504},{"id":"https://openalex.org/keywords/conjunction","display_name":"Conjunction (astronomy)","score":0.42460232973098755},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.42096567153930664},{"id":"https://openalex.org/keywords/sensory-cue","display_name":"Sensory cue","score":0.4176754951477051},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.3974970579147339},{"id":"https://openalex.org/keywords/audio-signal","display_name":"Audio signal","score":0.37480008602142334},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.2510402500629425},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.2249981164932251}],"concepts":[{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.8804715871810913},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8198353052139282},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.7748548984527588},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6089147925376892},{"id":"https://openalex.org/C158154518","wikidata":"https://www.wikidata.org/wiki/Q7310970","display_name":"Relevance (law)","level":2,"score":0.5049014687538147},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5024187564849854},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.490054726600647},{"id":"https://openalex.org/C160372630","wikidata":"https://www.wikidata.org/wiki/Q4819855","display_name":"Audio analyzer","level":5,"score":0.4559413194656372},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.4391140043735504},{"id":"https://openalex.org/C59656382","wikidata":"https://www.wikidata.org/wiki/Q191536","display_name":"Conjunction (astronomy)","level":2,"score":0.42460232973098755},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.42096567153930664},{"id":"https://openalex.org/C111370547","wikidata":"https://www.wikidata.org/wiki/Q7451120","display_name":"Sensory cue","level":2,"score":0.4176754951477051},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3974970579147339},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.37480008602142334},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.2510402500629425},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.2249981164932251},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C1276947","wikidata":"https://www.wikidata.org/wiki/Q333","display_name":"Astronomy","level":1,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3536221.3556625","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3536221.3556625","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2022 International Conference on Multimodal Interaction","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":42,"referenced_works":["https://openalex.org/W1152443276","https://openalex.org/W1484037809","https://openalex.org/W1901129140","https://openalex.org/W1978479866","https://openalex.org/W1979472948","https://openalex.org/W1996721809","https://openalex.org/W2004294009","https://openalex.org/W2081106288","https://openalex.org/W2091008902","https://openalex.org/W2101194540","https://openalex.org/W2101461157","https://openalex.org/W2119577735","https://openalex.org/W2128272608","https://openalex.org/W2163292664","https://openalex.org/W2184568381","https://openalex.org/W2213100575","https://openalex.org/W2395371822","https://openalex.org/W2471855951","https://openalex.org/W2500896654","https://openalex.org/W2529272619","https://openalex.org/W2756582047","https://openalex.org/W2808631503","https://openalex.org/W2883429621","https://openalex.org/W2903319259","https://openalex.org/W2962711746","https://openalex.org/W2962965915","https://openalex.org/W2963524571","https://openalex.org/W2986131415","https://openalex.org/W2997304642","https://openalex.org/W3000351820","https://openalex.org/W3011978462","https://openalex.org/W3030831740","https://openalex.org/W3034287518","https://openalex.org/W3097337310","https://openalex.org/W3100302033","https://openalex.org/W3149149291","https://openalex.org/W3175300676","https://openalex.org/W3194334462","https://openalex.org/W3206008172","https://openalex.org/W3206366488","https://openalex.org/W4205423437","https://openalex.org/W4253434585"],"related_works":["https://openalex.org/W2098934641","https://openalex.org/W2494533082","https://openalex.org/W4214771044","https://openalex.org/W4387698063","https://openalex.org/W4382560817","https://openalex.org/W3004352674","https://openalex.org/W1975359510","https://openalex.org/W3110605476","https://openalex.org/W1803351015","https://openalex.org/W2363106653"],"abstract_inverted_index":{"Despite":[0],"existing":[1],"works":[2],"of":[3,35,71,107,124],"Audio-Visual":[4],"Saliency":[5],"Prediction":[6],"(AVSP)":[7],"models":[8,22,85,113],"claiming":[9],"to":[10,24,81,118],"achieve":[11],"promising":[12],"results":[13],"by":[14,48,128],"fusing":[15],"audio":[16,26,36,51,96,108],"modality":[17],"over":[18,91],"visual-only":[19,92],"models,":[20,93],"these":[21],"fail":[23],"leverage":[25],"information.":[27],"In":[28],"this":[29],"paper,":[30],"we":[31,79],"investigate":[32],"the":[33,41,72,95,105,116,125],"relevance":[34],"cues":[37],"in":[38,89,109],"conjunction":[39],"with":[40],"visual":[42],"ones":[43],"and":[44,53,114],"conduct":[45],"extensive":[46],"analysis":[47,62],"employing":[49],"well-established":[50],"modules":[52],"fusion":[54],"techniques":[55],"from":[56],"diverse":[57,65],"correlated":[58],"audio-visual":[59],"tasks.":[60],"Our":[61,102],"on":[63],"ten":[64],"saliency":[66],"datasets":[67],"suggests":[68],"that":[69,130],"none":[70],"methods":[73],"worked":[74],"for":[75,122],"incorporating":[76],"audio.":[77],"Furthermore,":[78],"bring":[80],"light,":[82],"why":[83],"AVSP":[84,112],"show":[86],"a":[87,119],"gain":[88],"performance":[90],"though":[94],"branch":[97],"is":[98],"agnostic":[99],"at":[100],"inference.":[101],"work":[103,133],"questions":[104],"role":[106],"current":[110],"deep":[111],"motivates":[115],"community":[117],"clear":[120],"avenue":[121],"reconsideration":[123],"complex":[126],"architectures":[127],"demonstrating":[129],"simpler":[131],"alternatives":[132],"equally":[134],"well.":[135]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
