{"id":"https://openalex.org/W4206227781","doi":"https://doi.org/10.23919/eusipco54536.2021.9616257","title":"Watch, Listen, and Answer: Open-Ended VideoQA with Modulated Multi-Stream 3D ConvNets","display_name":"Watch, Listen, and Answer: Open-Ended VideoQA with Modulated Multi-Stream 3D ConvNets","publication_year":2021,"publication_date":"2021-08-23","ids":{"openalex":"https://openalex.org/W4206227781","doi":"https://doi.org/10.23919/eusipco54536.2021.9616257"},"language":"en","primary_location":{"id":"doi:10.23919/eusipco54536.2021.9616257","is_oa":false,"landing_page_url":"https://doi.org/10.23919/eusipco54536.2021.9616257","pdf_url":null,"source":{"id":"https://openalex.org/S4363607854","display_name":"2021 29th European Signal Processing Conference (EUSIPCO)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 29th European Signal Processing Conference (EUSIPCO)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5006630406","display_name":"Taiki Miyanishi","orcid":"https://orcid.org/0000-0001-9105-1601"},"institutions":[{"id":"https://openalex.org/I4210126580","display_name":"RIKEN Center for Advanced Intelligence Project","ror":"https://ror.org/03ckxwf91","country_code":"JP","type":"facility","lineage":["https://openalex.org/I4210110652","https://openalex.org/I4210126580"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Taiki Miyanishi","raw_affiliation_strings":["ATR, RIKEN AIP, Kyoto, Japan"],"affiliations":[{"raw_affiliation_string":"ATR, RIKEN AIP, Kyoto, Japan","institution_ids":["https://openalex.org/I4210126580"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5004328317","display_name":"Motoaki Kawanabe","orcid":null},"institutions":[{"id":"https://openalex.org/I4210126580","display_name":"RIKEN Center for Advanced Intelligence Project","ror":"https://ror.org/03ckxwf91","country_code":"JP","type":"facility","lineage":["https://openalex.org/I4210110652","https://openalex.org/I4210126580"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Motoaki Kawanabe","raw_affiliation_strings":["ATR, RIKEN AIP, Kyoto, Japan"],"affiliations":[{"raw_affiliation_string":"ATR, RIKEN AIP, Kyoto, Japan","institution_ids":["https://openalex.org/I4210126580"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5006630406"],"corresponding_institution_ids":["https://openalex.org/I4210126580"],"apc_list":null,"apc_paid":null,"fwci":0.2614,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.65668203,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"706","last_page":"710"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9900000095367432,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8330796957015991},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.610529899597168},{"id":"https://openalex.org/keywords/motion","display_name":"Motion (physics)","score":0.5779808759689331},{"id":"https://openalex.org/keywords/convolutional-neural-network","display_name":"Convolutional neural network","score":0.5317432284355164},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.5246196985244751},{"id":"https://openalex.org/keywords/fuse","display_name":"Fuse (electrical)","score":0.5041972398757935},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.45964694023132324},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.42009860277175903},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.41756346821784973},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.4152255654335022},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.3970951437950134}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8330796957015991},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.610529899597168},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.5779808759689331},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.5317432284355164},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.5246196985244751},{"id":"https://openalex.org/C141353440","wikidata":"https://www.wikidata.org/wiki/Q182221","display_name":"Fuse (electrical)","level":2,"score":0.5041972398757935},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.45964694023132324},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.42009860277175903},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.41756346821784973},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.4152255654335022},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3970951437950134},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.23919/eusipco54536.2021.9616257","is_oa":false,"landing_page_url":"https://doi.org/10.23919/eusipco54536.2021.9616257","pdf_url":null,"source":{"id":"https://openalex.org/S4363607854","display_name":"2021 29th European Signal Processing Conference (EUSIPCO)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 29th European Signal Processing Conference (EUSIPCO)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G5897545720","display_name":null,"funder_award_id":"JP18KK0284","funder_id":"https://openalex.org/F4320334764","funder_display_name":"Japan Society for the Promotion of Science"},{"id":"https://openalex.org/G8004422286","display_name":null,"funder_award_id":"JP-MJCR15E2","funder_id":"https://openalex.org/F4320320907","funder_display_name":"Japan Science and Technology Corporation"}],"funders":[{"id":"https://openalex.org/F4320320907","display_name":"Japan Science and Technology Corporation","ror":"https://ror.org/00097mb19"},{"id":"https://openalex.org/F4320334764","display_name":"Japan Society for the Promotion of Science","ror":"https://ror.org/00hhkn466"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":47,"referenced_works":["https://openalex.org/W1836465849","https://openalex.org/W1933349210","https://openalex.org/W2133564696","https://openalex.org/W2194775991","https://openalex.org/W2606982687","https://openalex.org/W2760103357","https://openalex.org/W2765716052","https://openalex.org/W2810643877","https://openalex.org/W2896457183","https://openalex.org/W2899663614","https://openalex.org/W2954199749","https://openalex.org/W2962865004","https://openalex.org/W2962934715","https://openalex.org/W2962949233","https://openalex.org/W2963820951","https://openalex.org/W2964220823","https://openalex.org/W2966238183","https://openalex.org/W2968917279","https://openalex.org/W2974161034","https://openalex.org/W2980339970","https://openalex.org/W2981851635","https://openalex.org/W2985144848","https://openalex.org/W2990503944","https://openalex.org/W2998166190","https://openalex.org/W3002552512","https://openalex.org/W3016658915","https://openalex.org/W3034730770","https://openalex.org/W3035333188","https://openalex.org/W3035497460","https://openalex.org/W3043840704","https://openalex.org/W3094550259","https://openalex.org/W3128726204","https://openalex.org/W4287777632","https://openalex.org/W4297749157","https://openalex.org/W4385245566","https://openalex.org/W6638667902","https://openalex.org/W6679434410","https://openalex.org/W6720468700","https://openalex.org/W6739901393","https://openalex.org/W6740674931","https://openalex.org/W6744865055","https://openalex.org/W6748270630","https://openalex.org/W6755207826","https://openalex.org/W6767164110","https://openalex.org/W6772149162","https://openalex.org/W6777481484","https://openalex.org/W6790679657"],"related_works":["https://openalex.org/W3000097931","https://openalex.org/W2354322770","https://openalex.org/W4237547500","https://openalex.org/W1570848052","https://openalex.org/W2373192430","https://openalex.org/W4239268388","https://openalex.org/W4243305035","https://openalex.org/W1537496349","https://openalex.org/W2379407973","https://openalex.org/W2350267540"],"abstract_inverted_index":{"We":[0,86],"propose":[1],"an":[2,114],"open-ended":[3,24],"multimodal":[4,17,75,147,163,170],"video":[5,45,84,208],"question":[6,127,141],"answering":[7],"(VideoQA)":[8],"method":[9,172,176,184],"that":[10,40,54,182],"predicts":[11,153],"textual":[12],"answers":[13,189],"by":[14,190],"referring":[15],"to":[16,70,129,192],"information":[18,128,148],"derived":[19],"from":[20,33],"videos.":[21],"Most":[22],"current":[23],"VideoQA":[25,158,171],"methods":[26],"focus":[27],"on":[28,64,117,139,177],"motion":[29,107,199],"and":[30,35,58,97,108,121,152,173,200],"appearance":[31,109,201],"features":[32,39,60,76,80,110,134,202],"videos":[34],"ignore":[36],"the":[37,79,136,140,146,154,187,193,198,207],"audio":[38,59,115,194],"are":[41,203],"useful":[42],"for":[43,83,101,205],"understanding":[44,206],"content":[46],"in":[47],"more":[48],"detail.":[49],"A":[50],"few":[51],"prior":[52],"works":[53],"use":[55],"motion,":[56],"appearance,":[57],"showed":[61],"poor":[62],"results":[63],"public":[65],"benchmarks":[66],"since":[67],"they":[68],"failed":[69],"(e.g.,":[71],"region":[72],"or":[73],"grid-level)":[74],"effectively":[77,161],"fuse":[78],"with":[81,90,126],"details":[82],"reasoning.":[85],"overcame":[87],"these":[88],"limitations":[89],"multi-stream":[91],"3-dimensional":[92],"convolutional":[93],"networks":[94],"(3D":[95],"ConvNets)":[96],"a":[98,168,174],"transformer-based":[99],"modulator":[100],"VideoQA.":[102],"Our":[103,157],"network":[104,144],"represents":[105],"detailed":[106],"as":[111,113],"well":[112],"feature":[116],"multiple":[118],"3D":[119,150],"ConvNets":[120,151],"modulates":[122],"each":[123],"intermediate":[124],"representation":[125],"extract":[130],"their":[131],"relevant":[132],"spatiotemporal":[133],"over":[135],"frames.":[137],"Based":[138],"content,":[142],"our":[143,183],"fuses":[145],"of":[149],"final":[155],"answers.":[156],"method,":[159],"which":[160],"combined":[162],"data":[164],"yields,":[165],"outperformed":[166],"both":[167],"previous":[169],"state-of-the-art":[175],"standard":[178],"benchmarks.":[179],"Visualization":[180],"suggests":[181],"can":[185],"predict":[186],"correct":[188],"listening":[191],"information,":[195],"even":[196],"when":[197],"inadequate":[204],"constant.":[209]},"counts_by_year":[{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
