{"id":"https://openalex.org/W4205261319","doi":"https://doi.org/10.1109/tpami.2021.3139957","title":"Depth and Video Segmentation Based Visual Attention for Embodied Question Answering","display_name":"Depth and Video Segmentation Based Visual Attention for Embodied Question Answering","publication_year":2022,"publication_date":"2022-01-04","ids":{"openalex":"https://openalex.org/W4205261319","doi":"https://doi.org/10.1109/tpami.2021.3139957","pmid":"https://pubmed.ncbi.nlm.nih.gov/34982673"},"language":"en","primary_location":{"id":"doi:10.1109/tpami.2021.3139957","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2021.3139957","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Haonan Luo","orcid":"https://orcid.org/0000-0002-9121-2687"},"institutions":[{"id":"https://openalex.org/I36399199","display_name":"Nanjing University of Science and Technology","ror":"https://ror.org/00xp9wg62","country_code":"CN","type":"education","lineage":["https://openalex.org/I36399199"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Haonan Luo","raw_affiliation_strings":["School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, Jiangsu, China"],"raw_orcid":"https://orcid.org/0000-0002-9121-2687","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, Jiangsu, China","institution_ids":["https://openalex.org/I36399199"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Guosheng Lin","orcid":"https://orcid.org/0000-0002-0329-7458"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Guosheng Lin","raw_affiliation_strings":["School of Computer Science and Engineering, Nanyang Technological University, Singapore"],"raw_orcid":"https://orcid.org/0000-0002-0329-7458","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Nanyang Technological University, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yazhou Yao","orcid":"https://orcid.org/0000-0002-0337-9410"},"institutions":[{"id":"https://openalex.org/I36399199","display_name":"Nanjing University of Science and Technology","ror":"https://ror.org/00xp9wg62","country_code":"CN","type":"education","lineage":["https://openalex.org/I36399199"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yazhou Yao","raw_affiliation_strings":["School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, Jiangsu, China"],"raw_orcid":"https://orcid.org/0000-0002-0337-9410","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, Jiangsu, China","institution_ids":["https://openalex.org/I36399199"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Fayao Liu","orcid":"https://orcid.org/0000-0001-6649-7660"},"institutions":[{"id":"https://openalex.org/I115228651","display_name":"Agency for Science, Technology and Research","ror":"https://ror.org/036wvzt09","country_code":"SG","type":"government","lineage":["https://openalex.org/I115228651"]},{"id":"https://openalex.org/I3005327000","display_name":"Institute for Infocomm Research","ror":"https://ror.org/053rfa017","country_code":"SG","type":"facility","lineage":["https://openalex.org/I115228651","https://openalex.org/I3005327000","https://openalex.org/I91275662"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Fayao Liu","raw_affiliation_strings":["Institute for Infocomm Research A*STAR, Singapore"],"raw_orcid":"https://orcid.org/0000-0001-6649-7660","affiliations":[{"raw_affiliation_string":"Institute for Infocomm Research A*STAR, Singapore","institution_ids":["https://openalex.org/I3005327000","https://openalex.org/I115228651"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Zichuan Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Zichuan Liu","raw_affiliation_strings":["School of Computer Science and Engineering, Nanyang Technological University, Singapore"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Nanyang Technological University, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"last","author":{"id":null,"display_name":"Zhenmin Tang","orcid":"https://orcid.org/0000-0001-6708-2205"},"institutions":[{"id":"https://openalex.org/I36399199","display_name":"Nanjing University of Science and Technology","ror":"https://ror.org/00xp9wg62","country_code":"CN","type":"education","lineage":["https://openalex.org/I36399199"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhenmin Tang","raw_affiliation_strings":["School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, Jiangsu, China"],"raw_orcid":"https://orcid.org/0000-0001-6708-2205","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, Jiangsu, China","institution_ids":["https://openalex.org/I36399199"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I36399199"],"apc_list":null,"apc_paid":null,"fwci":1.7257,"has_fulltext":false,"cited_by_count":18,"citation_normalized_percentile":{"value":0.85217264,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":99},"biblio":{"volume":"45","issue":"6","first_page":"6807","last_page":"6819"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9811999797821045,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9811999797821045,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10709","display_name":"Social Robot Interaction and HRI","score":0.005100000184029341,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10191","display_name":"Robotics and Sensor-Based Localization","score":0.002099999925121665,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.8432999849319458},{"id":"https://openalex.org/keywords/embodied-cognition","display_name":"Embodied cognition","score":0.7603999972343445},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.6940000057220459},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.5217000246047974},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5009999871253967},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.46470001339912415},{"id":"https://openalex.org/keywords/image-segmentation","display_name":"Image segmentation","score":0.4275999963283539},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.4172999858856201}],"concepts":[{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.8432999849319458},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8367999792098999},{"id":"https://openalex.org/C100609095","wikidata":"https://www.wikidata.org/wiki/Q1335050","display_name":"Embodied cognition","level":2,"score":0.7603999972343445},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.6940000057220459},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6728000044822693},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.5217000246047974},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5009999871253967},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.46470001339912415},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.4275999963283539},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.42320001125335693},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.4172999858856201},{"id":"https://openalex.org/C173414695","wikidata":"https://www.wikidata.org/wiki/Q5510276","display_name":"Fusion mechanism","level":4,"score":0.3594000041484833},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.3564000129699707},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3269999921321869},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.31360000371932983},{"id":"https://openalex.org/C2986089797","wikidata":"https://www.wikidata.org/wiki/Q6501338","display_name":"Visual attention","level":3,"score":0.3100999891757965},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3041999936103821},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.3003999888896942},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.2939000129699707},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.28110000491142273},{"id":"https://openalex.org/C2777946921","wikidata":"https://www.wikidata.org/wiki/Q7449044","display_name":"Semantic analysis (machine learning)","level":2,"score":0.2768999934196472},{"id":"https://openalex.org/C83665646","wikidata":"https://www.wikidata.org/wiki/Q42139305","display_name":"Feature vector","level":2,"score":0.2766000032424927},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2736000120639801},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.26669999957084656},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.26019999384880066},{"id":"https://openalex.org/C125308379","wikidata":"https://www.wikidata.org/wiki/Q363057","display_name":"Market segmentation","level":2,"score":0.25110000371932983}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/tpami.2021.3139957","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2021.3139957","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},{"id":"pmid:34982673","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/34982673","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE transactions on pattern analysis and machine intelligence","raw_type":null}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1285748160","display_name":null,"funder_award_id":"30920021135","funder_id":"https://openalex.org/F4320335787","funder_display_name":"Fundamental Research Funds for the Central Universities"},{"id":"https://openalex.org/G128715220","display_name":null,"funder_award_id":"BK20210327","funder_id":"https://openalex.org/F4320322769","funder_display_name":"Natural Science Foundation of Jiangsu Province"},{"id":"https://openalex.org/G1591232902","display_name":null,"funder_award_id":"62102182","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3220463999","display_name":null,"funder_award_id":"AISG-RP-2018-003","funder_id":"https://openalex.org/F4320320709","funder_display_name":"National Research Foundation Singapore"},{"id":"https://openalex.org/G6318443163","display_name":null,"funder_award_id":"61905114","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7000730516","display_name":null,"funder_award_id":"61976116","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320320709","display_name":"National Research Foundation Singapore","ror":"https://ror.org/03cpyc314"},{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320322769","display_name":"Natural Science Foundation of Jiangsu Province","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335787","display_name":"Fundamental Research Funds for the Central Universities","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":52,"referenced_works":["https://openalex.org/W1565402342","https://openalex.org/W1923184257","https://openalex.org/W1933349210","https://openalex.org/W1999874108","https://openalex.org/W2067912884","https://openalex.org/W2119717200","https://openalex.org/W2125215748","https://openalex.org/W2142192571","https://openalex.org/W2157331557","https://openalex.org/W2194775991","https://openalex.org/W2250539671","https://openalex.org/W2277195237","https://openalex.org/W2552900565","https://openalex.org/W2557465155","https://openalex.org/W2560474170","https://openalex.org/W2563705555","https://openalex.org/W2579549467","https://openalex.org/W2745461083","https://openalex.org/W2808877322","https://openalex.org/W2930283066","https://openalex.org/W2962684798","https://openalex.org/W2962749469","https://openalex.org/W2962779575","https://openalex.org/W2962789679","https://openalex.org/W2963445119","https://openalex.org/W2963656855","https://openalex.org/W2963800628","https://openalex.org/W2963954913","https://openalex.org/W2964067226","https://openalex.org/W2964118342","https://openalex.org/W2964303913","https://openalex.org/W2964339842","https://openalex.org/W2964935470","https://openalex.org/W2985321619","https://openalex.org/W2993182889","https://openalex.org/W2995933538","https://openalex.org/W3035086574","https://openalex.org/W6605121731","https://openalex.org/W6620707391","https://openalex.org/W6638088447","https://openalex.org/W6685520387","https://openalex.org/W6693561243","https://openalex.org/W6701498492","https://openalex.org/W6719057275","https://openalex.org/W6723762606","https://openalex.org/W6733387591","https://openalex.org/W6746518932","https://openalex.org/W6747912417","https://openalex.org/W6756121852","https://openalex.org/W6765766786","https://openalex.org/W6770438373","https://openalex.org/W6782298681"],"related_works":[],"abstract_inverted_index":{"Embodied":[0,91],"Question":[1,92,128],"Answering":[2,129],"(EQA)":[3],"is":[4,13,123,137],"a":[5,82,102,115,133],"newly":[6],"defined":[7],"research":[8,29],"area":[9],"where":[10],"an":[11],"agent":[12],"required":[14],"to":[15,32,57,65,139,169],"answer":[16],"the":[17,22,44,58,66,73,111,126,141,159,162],"user's":[18],"questions":[19],"by":[20,100,110],"exploring":[21],"real-world":[23],"environment.":[24,75],"It":[25],"has":[26],"attracted":[27],"increasing":[28],"interests":[30],"due":[31,56],"its":[33],"broad":[34],"applications":[35],"in":[36,49,177],"personal":[37],"assistants":[38],"and":[39,53,68,84,117,165,172,182],"in-home":[40],"robots.":[41],"Most":[42],"of":[43,51,60,72,161],"existing":[45],"methods":[46],"perform":[47],"poorly":[48],"terms":[50],"answering":[52],"navigation":[54,166],"accuracy":[55,179],"absence":[59],"fine-level":[61],"semantic":[62,98,113],"information,":[63],"stability":[64],"ambiguity,":[67],"3D":[69],"spatial":[70],"information":[71],"virtual":[74],"To":[76],"tackle":[77],"these":[78],"problems,":[79],"we":[80,95],"propose":[81],"depth":[83,116],"segmentation":[85,106,118],"based":[86,119],"visual":[87,120],"attention":[88,121],"mechanism":[89,122],"for":[90,125],"Answering.":[93],"First,":[94],"extract":[96],"local":[97],"features":[99],"introducing":[101],"novel":[103],"high-speed":[104],"video":[105],"framework.":[107],"Then":[108],"guided":[109],"extracted":[112],"features,":[114],"proposed":[124],"Visual":[127],"(VQA)":[130],"sub-task.":[131],"Further,":[132],"feature":[134],"fusion":[135],"strategy":[136],"designed":[138],"guide":[140],"navigator's":[142],"training":[143],"process":[144],"without":[145],"much":[146],"additional":[147],"computational":[148],"cost.":[149],"The":[150],"ablation":[151],"experiments":[152],"show":[153],"that":[154],"our":[155],"method":[156],"effectively":[157],"boosts":[158],"performance":[160],"VQA":[163],"module":[164],"module,":[167],"leading":[168],"4.9":[170],"%":[171,174],"5.6":[173],"overall":[175],"improvement":[176],"EQA":[178],"on":[180],"House3D":[181],"Matterport3D":[183],"datasets":[184],"respectively.":[185]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":8},{"year":2024,"cited_by_count":5},{"year":2023,"cited_by_count":4}],"updated_date":"2026-06-03T09:05:47.796612","created_date":"2022-01-25T00:00:00"}
