{"id":"https://openalex.org/W3138095408","doi":"https://doi.org/10.1109/iros51168.2021.9635989","title":"ViNet: Pushing the limits of Visual Modality for Audio-Visual Saliency Prediction","display_name":"ViNet: Pushing the limits of Visual Modality for Audio-Visual Saliency Prediction","publication_year":2021,"publication_date":"2021-09-27","ids":{"openalex":"https://openalex.org/W3138095408","doi":"https://doi.org/10.1109/iros51168.2021.9635989","mag":"3138095408"},"language":"en","primary_location":{"id":"doi:10.1109/iros51168.2021.9635989","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iros51168.2021.9635989","pdf_url":null,"source":{"id":"https://openalex.org/S4363607734","display_name":"2021 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101608990","display_name":"Samyak Jain","orcid":null},"institutions":[{"id":"https://openalex.org/I64189192","display_name":"International Institute of Information Technology, Hyderabad","ror":"https://ror.org/05f11g639","country_code":"IN","type":"education","lineage":["https://openalex.org/I64189192"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Samyak Jain","raw_affiliation_strings":["International Institute for Information Technology, Hyderabad"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"International Institute for Information Technology, Hyderabad","institution_ids":["https://openalex.org/I64189192"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019427210","display_name":"Pradeep Yarlagadda","orcid":null},"institutions":[{"id":"https://openalex.org/I64189192","display_name":"International Institute of Information Technology, Hyderabad","ror":"https://ror.org/05f11g639","country_code":"IN","type":"education","lineage":["https://openalex.org/I64189192"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Pradeep Yarlagadda","raw_affiliation_strings":["International Institute for Information Technology, Hyderabad"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"International Institute for Information Technology, Hyderabad","institution_ids":["https://openalex.org/I64189192"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039292825","display_name":"Shreyank Jyoti","orcid":null},"institutions":[{"id":"https://openalex.org/I64189192","display_name":"International Institute of Information Technology, Hyderabad","ror":"https://ror.org/05f11g639","country_code":"IN","type":"education","lineage":["https://openalex.org/I64189192"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Shreyank Jyoti","raw_affiliation_strings":["International Institute for Information Technology, Hyderabad"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"International Institute for Information Technology, Hyderabad","institution_ids":["https://openalex.org/I64189192"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100764522","display_name":"Shyamgopal Karthik","orcid":null},"institutions":[{"id":"https://openalex.org/I64189192","display_name":"International Institute of Information Technology, Hyderabad","ror":"https://ror.org/05f11g639","country_code":"IN","type":"education","lineage":["https://openalex.org/I64189192"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Shyamgopal Karthik","raw_affiliation_strings":["International Institute for Information Technology, Hyderabad"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"International Institute for Information Technology, Hyderabad","institution_ids":["https://openalex.org/I64189192"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089971631","display_name":"Ramanathan Subramanian","orcid":"https://orcid.org/0000-0001-9441-7074"},"institutions":[{"id":"https://openalex.org/I188329596","display_name":"University of Canberra","ror":"https://ror.org/04s1nv328","country_code":"AU","type":"education","lineage":["https://openalex.org/I188329596"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Ramanathan Subramanian","raw_affiliation_strings":["University of Canberra"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Canberra","institution_ids":["https://openalex.org/I188329596"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5067298540","display_name":"Vineet Gandhi","orcid":"https://orcid.org/0000-0001-8861-7731"},"institutions":[{"id":"https://openalex.org/I64189192","display_name":"International Institute of Information Technology, Hyderabad","ror":"https://ror.org/05f11g639","country_code":"IN","type":"education","lineage":["https://openalex.org/I64189192"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Vineet Gandhi","raw_affiliation_strings":["International Institute for Information Technology, Hyderabad"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"International Institute for Information Technology, Hyderabad","institution_ids":["https://openalex.org/I64189192"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":4.004,"has_fulltext":false,"cited_by_count":85,"citation_normalized_percentile":{"value":0.95587393,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"3520","last_page":"3527"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.9921000003814697,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9912999868392944,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.7994482517242432},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6812177300453186},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.599734902381897},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5860791802406311},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.5036739706993103},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.5020802021026611},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.43852102756500244},{"id":"https://openalex.org/keywords/autoencoder","display_name":"Autoencoder","score":0.4215494990348816},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.42010465264320374},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.37103471159935},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.12219130992889404}],"concepts":[{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.7994482517242432},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6812177300453186},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.599734902381897},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5860791802406311},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.5036739706993103},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.5020802021026611},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.43852102756500244},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.4215494990348816},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.42010465264320374},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.37103471159935},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.12219130992889404},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/iros51168.2021.9635989","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iros51168.2021.9635989","pdf_url":null,"source":{"id":"https://openalex.org/S4363607734","display_name":"2021 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.6899999976158142,"display_name":"Sustainable cities and communities","id":"https://metadata.un.org/sdg/11"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":71,"referenced_works":["https://openalex.org/W1152443276","https://openalex.org/W1538842650","https://openalex.org/W1545595850","https://openalex.org/W1901129140","https://openalex.org/W1934890906","https://openalex.org/W1978479866","https://openalex.org/W1979472948","https://openalex.org/W1988712925","https://openalex.org/W2004294009","https://openalex.org/W2015394094","https://openalex.org/W2034197131","https://openalex.org/W2038121453","https://openalex.org/W2081106288","https://openalex.org/W2091008902","https://openalex.org/W2099329509","https://openalex.org/W2101194540","https://openalex.org/W2101461157","https://openalex.org/W2119577735","https://openalex.org/W2135835174","https://openalex.org/W2143688352","https://openalex.org/W2155479901","https://openalex.org/W2163292664","https://openalex.org/W2313180542","https://openalex.org/W2471855951","https://openalex.org/W2498738402","https://openalex.org/W2529272619","https://openalex.org/W2533370895","https://openalex.org/W2612135493","https://openalex.org/W2619697695","https://openalex.org/W2755981968","https://openalex.org/W2807186921","https://openalex.org/W2883429621","https://openalex.org/W2903319259","https://openalex.org/W2946520073","https://openalex.org/W2954215542","https://openalex.org/W2955060956","https://openalex.org/W2962711746","https://openalex.org/W2962756039","https://openalex.org/W2962960500","https://openalex.org/W2962965915","https://openalex.org/W2963115079","https://openalex.org/W2963503775","https://openalex.org/W2963581854","https://openalex.org/W2963680395","https://openalex.org/W2964109005","https://openalex.org/W2969741484","https://openalex.org/W2986131415","https://openalex.org/W2997304642","https://openalex.org/W2997360720","https://openalex.org/W3000351820","https://openalex.org/W3003643168","https://openalex.org/W3011154664","https://openalex.org/W3011978462","https://openalex.org/W3030831740","https://openalex.org/W3034287518","https://openalex.org/W3082657571","https://openalex.org/W3090763167","https://openalex.org/W3094461033","https://openalex.org/W3097337310","https://openalex.org/W3122238731","https://openalex.org/W3132652652","https://openalex.org/W4293665662","https://openalex.org/W6632051028","https://openalex.org/W6639824700","https://openalex.org/W6698589804","https://openalex.org/W6704477683","https://openalex.org/W6729831399","https://openalex.org/W6744446066","https://openalex.org/W6763249567","https://openalex.org/W6764953915","https://openalex.org/W6772514828"],"related_works":["https://openalex.org/W2159052453","https://openalex.org/W3013693939","https://openalex.org/W2566616303","https://openalex.org/W3131327266","https://openalex.org/W4297051394","https://openalex.org/W2752972570","https://openalex.org/W2734887215","https://openalex.org/W2803255133","https://openalex.org/W2909431601","https://openalex.org/W2221419418"],"abstract_inverted_index":{"We":[0,117],"propose":[1],"the":[2,29,73,95,102,111,130,138,143,148,153,162],"ViNet":[3,9,50,63,89,123],"architecture":[4,48,124],"for":[5,25,101,167,187],"audio-visual":[6,75,87,168,180],"saliency":[7,33,76,169,181],"prediction.":[8,170],"is":[10,51,55,110],"a":[11,22,32,120,184,193],"fully":[12],"convolutional":[13],"encoder-decoder":[14],"architecture.":[15],"The":[16,46,197],"encoder":[17],"uses":[18],"visual":[19],"features":[20,42,128],"from":[21,43],"network":[23,139],"trained":[24],"action":[26],"recognition,":[27],"and":[28,38,57,70,85,98,105,146,199],"decoder":[30],"infers":[31],"map":[34],"via":[35],"trilinear":[36],"interpolation":[37],"3D":[39],"convolutions,":[40],"combining":[41],"multiple":[44],"hierarchies.":[45],"overall":[47],"of":[49,122,152],"conceptually":[52],"simple;":[53],"it":[54,109],"causal":[56],"runs":[58],"in":[59,161,192],"real-time":[60],"(60":[61],"fps).":[62],"does":[64],"not":[65],"use":[66],"audio":[67,127,145,191],"as":[68],"input":[69,144],"still":[71],"outperforms":[72],"state-of-the-art":[74,164],"prediction":[77],"models":[78,165,201],"on":[79,94,177],"nine":[80],"different":[81],"datasets":[82],"(three":[83],"visual-only":[84],"six":[86],"datasets).":[88],"also":[90,118,157],"surpasses":[91],"human":[92],"performance":[93],"CC,":[96],"SIM":[97],"AUC":[99],"metrics":[100],"AVE":[103],"dataset,":[104],"to":[106,114,142],"our":[107,133],"knowledge,":[108],"first":[112],"model":[113],"do":[115],"so.":[116],"explore":[119],"variation":[121],"by":[125],"augmenting":[126],"into":[129],"decoder.":[131],"To":[132],"surprise,":[134],"upon":[135],"sufficient":[136],"training,":[137],"becomes":[140],"agnostic":[141],"provides":[147],"same":[149],"output":[150],"irrespective":[151],"input.":[154],"Interestingly,":[155],"we":[156],"observe":[158],"similar":[159],"behaviour":[160],"previous":[163,175],"[1]":[166],"Our":[171],"findings":[172],"contrast":[173],"with":[174],"works":[176],"deep":[178],"learning-based":[179],"prediction,":[182],"suggesting":[183],"clear":[185],"avenue":[186],"future":[188],"explorations":[189],"incorporating":[190],"more":[194],"effective":[195],"manner.":[196],"code":[198],"pre-trained":[200],"are":[202],"available":[203],"at":[204],"https://github.com/samyak0210/ViNet.":[205]},"counts_by_year":[{"year":2026,"cited_by_count":8},{"year":2025,"cited_by_count":16},{"year":2024,"cited_by_count":26},{"year":2023,"cited_by_count":25},{"year":2022,"cited_by_count":4},{"year":2021,"cited_by_count":6}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
