{"id":"https://openalex.org/W4402979627","doi":"https://doi.org/10.1109/icme57554.2024.10687514","title":"Common Sense Language-Guided Exploration and Hierarchical Dense Perception for Instruction Following Embodied Agents","display_name":"Common Sense Language-Guided Exploration and Hierarchical Dense Perception for Instruction Following Embodied Agents","publication_year":2024,"publication_date":"2024-07-15","ids":{"openalex":"https://openalex.org/W4402979627","doi":"https://doi.org/10.1109/icme57554.2024.10687514"},"language":"en","primary_location":{"id":"doi:10.1109/icme57554.2024.10687514","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme57554.2024.10687514","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5104338694","display_name":"Yuanwen Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yuanwen Chen","raw_affiliation_strings":["Chinese Academy of Sciences,The State Key Laboratory of Multimodal Artificial Intelligence Systems, Institute of Automation"],"affiliations":[{"raw_affiliation_string":"Chinese Academy of Sciences,The State Key Laboratory of Multimodal Artificial Intelligence Systems, Institute of Automation","institution_ids":["https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102726620","display_name":"Xinyao Zhang","orcid":"https://orcid.org/0009-0006-9892-7999"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xinyao Zhang","raw_affiliation_strings":["Chinese Academy of Sciences,The State Key Laboratory of Multimodal Artificial Intelligence Systems, Institute of Automation"],"affiliations":[{"raw_affiliation_string":"Chinese Academy of Sciences,The State Key Laboratory of Multimodal Artificial Intelligence Systems, Institute of Automation","institution_ids":["https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053853035","display_name":"Yaran Chen","orcid":"https://orcid.org/0000-0001-9356-0610"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yaran Chen","raw_affiliation_strings":["Chinese Academy of Sciences,The State Key Laboratory of Multimodal Artificial Intelligence Systems, Institute of Automation"],"affiliations":[{"raw_affiliation_string":"Chinese Academy of Sciences,The State Key Laboratory of Multimodal Artificial Intelligence Systems, Institute of Automation","institution_ids":["https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100624298","display_name":"Dongbin Zhao","orcid":"https://orcid.org/0000-0001-8218-9633"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dongbin Zhao","raw_affiliation_strings":["Chinese Academy of Sciences,The State Key Laboratory of Multimodal Artificial Intelligence Systems, Institute of Automation"],"affiliations":[{"raw_affiliation_string":"Chinese Academy of Sciences,The State Key Laboratory of Multimodal Artificial Intelligence Systems, Institute of Automation","institution_ids":["https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068919040","display_name":"Yunzhen Zhao","orcid":"https://orcid.org/0000-0002-0349-7257"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yunzhen Zhao","raw_affiliation_strings":["Tencent Beijing Research"],"affiliations":[{"raw_affiliation_string":"Tencent Beijing Research","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100631152","display_name":"Zhe Zhao","orcid":"https://orcid.org/0000-0003-4189-3258"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhe Zhao","raw_affiliation_strings":["Tencent Beijing Research"],"affiliations":[{"raw_affiliation_string":"Tencent Beijing Research","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100703624","display_name":"Pengfei Hu","orcid":"https://orcid.org/0009-0000-4537-6288"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Pengfei Hu","raw_affiliation_strings":["Tencent Beijing Research"],"affiliations":[{"raw_affiliation_string":"Tencent Beijing Research","institution_ids":["https://openalex.org/I2250653659"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5104338694"],"corresponding_institution_ids":["https://openalex.org/I19820366"],"apc_list":null,"apc_paid":null,"fwci":1.395,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.81885335,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13382","display_name":"Robotics and Automated Systems","score":0.5763999819755554,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13382","display_name":"Robotics and Automated Systems","score":0.5763999819755554,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/embodied-cognition","display_name":"Embodied cognition","score":0.928926944732666},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7115921974182129},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.66962730884552},{"id":"https://openalex.org/keywords/embodied-agent","display_name":"Embodied agent","score":0.5114856958389282},{"id":"https://openalex.org/keywords/sense","display_name":"Sense (electronics)","score":0.42550429701805115},{"id":"https://openalex.org/keywords/cognitive-science","display_name":"Cognitive science","score":0.397097647190094},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3460227847099304},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3385009169578552},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.33015477657318115},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.2157609760761261},{"id":"https://openalex.org/keywords/neuroscience","display_name":"Neuroscience","score":0.08965271711349487},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.08720144629478455}],"concepts":[{"id":"https://openalex.org/C100609095","wikidata":"https://www.wikidata.org/wiki/Q1335050","display_name":"Embodied cognition","level":2,"score":0.928926944732666},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7115921974182129},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.66962730884552},{"id":"https://openalex.org/C103683099","wikidata":"https://www.wikidata.org/wiki/Q5370102","display_name":"Embodied agent","level":3,"score":0.5114856958389282},{"id":"https://openalex.org/C143141573","wikidata":"https://www.wikidata.org/wiki/Q7450971","display_name":"Sense (electronics)","level":2,"score":0.42550429701805115},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.397097647190094},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3460227847099304},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3385009169578552},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.33015477657318115},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.2157609760761261},{"id":"https://openalex.org/C169760540","wikidata":"https://www.wikidata.org/wiki/Q207011","display_name":"Neuroscience","level":1,"score":0.08965271711349487},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.08720144629478455},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icme57554.2024.10687514","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme57554.2024.10687514","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.75,"display_name":"Quality Education"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":20,"referenced_works":["https://openalex.org/W2524771588","https://openalex.org/W2899024931","https://openalex.org/W2950697717","https://openalex.org/W2963800628","https://openalex.org/W3031478966","https://openalex.org/W3034758614","https://openalex.org/W3096390127","https://openalex.org/W3169924623","https://openalex.org/W4214520160","https://openalex.org/W4214700710","https://openalex.org/W4285306713","https://openalex.org/W4312436794","https://openalex.org/W4390873374","https://openalex.org/W6798219303","https://openalex.org/W6802203484","https://openalex.org/W6846853244","https://openalex.org/W6849793301","https://openalex.org/W6851800889","https://openalex.org/W6853219587","https://openalex.org/W6858299849"],"related_works":["https://openalex.org/W1487956045","https://openalex.org/W1601503673","https://openalex.org/W1592154258","https://openalex.org/W1527882169","https://openalex.org/W1596535966","https://openalex.org/W4388039923","https://openalex.org/W2096246921","https://openalex.org/W2145935766","https://openalex.org/W2136562935","https://openalex.org/W2513760693"],"abstract_inverted_index":{"Embodied":[0],"Instruction":[1],"Following":[2],"(EIF)":[3],"involves":[4],"the":[5,44,71,106,112,116,137,141,145],"task":[6],"of":[7,73,102,118,127],"locating":[8],"and":[9,28,64,97],"manipulating":[10],"objects":[11],"according":[12],"to":[13,25,69,93,136],"language":[14,67],"instructions.":[15],"Existing":[16],"methods":[17],"face":[18],"challenges":[19],"in":[20,43],"small":[21,40],"object":[22,41],"navigation":[23,42],"due":[24],"ineffective":[26],"exploration":[27,52,80],"imperfect":[29],"perception,":[30],"which":[31,89],"ultimately":[32],"affects":[33],"their":[34],"performance.":[35],"This":[36],"study":[37],"focuses":[38],"on":[39,111,131,144],"EIF":[45],"domain.":[46],"We":[47],"propose":[48,84],"Common":[49],"Sense":[50],"Language-guided":[51],"(CSL),":[53],"a":[54],"novel":[55],"approach":[56],"that":[57],"leverages":[58],"common-sense":[59],"knowledge":[60],"from":[61,66],"seen":[62],"scenes":[63,134],"information":[65],"instructions":[68],"infer":[70],"location":[72],"objects.":[74],"The":[75,100,120],"proposed":[76,121],"CSL":[77],"significantly":[78,104],"improves":[79,105],"efficiency.":[81],"Additionally,":[82],"we":[83],"Hierarchical":[85],"Dense":[86],"Perception":[87],"(HDP),":[88],"uses":[90],"hierarchical":[91],"features":[92],"perform":[94],"semantic":[95],"segmentation":[96],"depth":[98],"estimation.":[99],"use":[101],"HDP":[103],"agent\u2019s":[107],"perceptual":[108],"capabilities.":[109],"Experiments":[110],"ALFRED":[113],"benchmark":[114],"demonstrate":[115],"effectiveness":[117],"CSL-HDP.":[119],"CSL-HDP":[122],"achieves":[123],"an":[124],"absolute":[125],"improvement":[126],"9.29%":[128],"(18.45%":[129],"relative)":[130],"unseen":[132],"test":[133],"compared":[135],"previous":[138],"state-of-the-art,":[139],"securing":[140],"top":[142],"position":[143],"leaderboard.":[146],"Code":[147],"will":[148],"be":[149],"available":[150],"at":[151],"https://github.com/Cyuanwen/CSL-HDP.":[152]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":1}],"updated_date":"2025-12-21T01:58:51.020947","created_date":"2025-10-10T00:00:00"}
