{"id":"https://openalex.org/W4402592239","doi":"https://doi.org/10.1109/tip.2024.3459800","title":"To Boost Zero-Shot Generalization for Embodied Reasoning With Vision-Language Pre-Training","display_name":"To Boost Zero-Shot Generalization for Embodied Reasoning With Vision-Language Pre-Training","publication_year":2024,"publication_date":"2024-01-01","ids":{"openalex":"https://openalex.org/W4402592239","doi":"https://doi.org/10.1109/tip.2024.3459800","pmid":"https://pubmed.ncbi.nlm.nih.gov/39292596"},"language":"en","primary_location":{"id":"doi:10.1109/tip.2024.3459800","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tip.2024.3459800","pdf_url":null,"source":{"id":"https://openalex.org/S4210173141","display_name":"IEEE Transactions on Image Processing","issn_l":"1057-7149","issn":["1057-7149","1941-0042"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Image Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101514073","display_name":"Ke Su","orcid":"https://orcid.org/0000-0001-8110-9486"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Ke Su","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua-Bosch Joint ML Center, THBI Laboratory, BNRist Center, Institute for AI, Tsinghua University, Beijing, China","Dept. of Comp. Sci. and Tech, Institute for AI, Tsinghua-Bosch Joint ML Center, THBI Lab; BNRist Center, Tsinghua University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua-Bosch Joint ML Center, THBI Laboratory, BNRist Center, Institute for AI, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"Dept. of Comp. Sci. and Tech, Institute for AI, Tsinghua-Bosch Joint ML Center, THBI Lab; BNRist Center, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100343569","display_name":"Xingxing Zhang","orcid":"https://orcid.org/0000-0003-4012-3796"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xingxing Zhang","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua-Bosch Joint ML Center, THBI Laboratory, BNRist Center, Institute for AI, Tsinghua University, Beijing, China","Dept. of Comp. Sci. and Tech, Institute for AI, Tsinghua-Bosch Joint ML Center, THBI Lab; BNRist Center, Tsinghua University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua-Bosch Joint ML Center, THBI Laboratory, BNRist Center, Institute for AI, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"Dept. of Comp. Sci. and Tech, Institute for AI, Tsinghua-Bosch Joint ML Center, THBI Lab; BNRist Center, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115592949","display_name":"Siyang Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I205237279","display_name":"Nankai University","ror":"https://ror.org/01y1kjr75","country_code":"CN","type":"education","lineage":["https://openalex.org/I205237279"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Siyang Zhang","raw_affiliation_strings":["School of Artificial Intelligence, Nankai University, Tianjin, China","Nankai University, Tianjing, China"],"affiliations":[{"raw_affiliation_string":"School of Artificial Intelligence, Nankai University, Tianjin, China","institution_ids":["https://openalex.org/I205237279"]},{"raw_affiliation_string":"Nankai University, Tianjing, China","institution_ids":["https://openalex.org/I205237279"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100606995","display_name":"Jun Zhu","orcid":"https://orcid.org/0000-0002-6254-2388"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jun Zhu","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua-Bosch Joint ML Center, THBI Laboratory, BNRist Center, Institute for AI, Tsinghua University, Beijing, China","Dept. of Comp. Sci. and Tech, Institute for AI, Tsinghua-Bosch Joint ML Center, THBI Lab; BNRist Center, Tsinghua University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua-Bosch Joint ML Center, THBI Laboratory, BNRist Center, Institute for AI, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"Dept. of Comp. Sci. and Tech, Institute for AI, Tsinghua-Bosch Joint ML Center, THBI Lab; BNRist Center, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100744420","display_name":"Bo Zhang","orcid":"https://orcid.org/0000-0002-0302-2550"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bo Zhang","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua-Bosch Joint ML Center, THBI Laboratory, BNRist Center, Institute for AI, Tsinghua University, Beijing, China","Dept. of Comp. Sci. and Tech, Institute for AI, Tsinghua-Bosch Joint ML Center, THBI Lab; BNRist Center, Tsinghua University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua-Bosch Joint ML Center, THBI Laboratory, BNRist Center, Institute for AI, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"Dept. of Comp. Sci. and Tech, Institute for AI, Tsinghua-Bosch Joint ML Center, THBI Lab; BNRist Center, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5101514073"],"corresponding_institution_ids":["https://openalex.org/I99065089"],"apc_list":null,"apc_paid":null,"fwci":1.2249,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.80660717,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":97,"max":98},"biblio":{"volume":"33","issue":null,"first_page":"5370","last_page":"5381"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9957000017166138,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9957000017166138,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.953499972820282,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9144999980926514,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/embodied-cognition","display_name":"Embodied cognition","score":0.6546541452407837},{"id":"https://openalex.org/keywords/zero","display_name":"Zero (linguistics)","score":0.6531476378440857},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.637431800365448},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6020711660385132},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.5915653705596924},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.5392268300056458},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.5303353071212769},{"id":"https://openalex.org/keywords/shot","display_name":"Shot (pellet)","score":0.5265971422195435},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3793215751647949},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.32522833347320557},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.3018813729286194},{"id":"https://openalex.org/keywords/mathematical-analysis","display_name":"Mathematical analysis","score":0.1271878480911255},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.10460224747657776}],"concepts":[{"id":"https://openalex.org/C100609095","wikidata":"https://www.wikidata.org/wiki/Q1335050","display_name":"Embodied cognition","level":2,"score":0.6546541452407837},{"id":"https://openalex.org/C2780813799","wikidata":"https://www.wikidata.org/wiki/Q3274237","display_name":"Zero (linguistics)","level":2,"score":0.6531476378440857},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.637431800365448},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6020711660385132},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.5915653705596924},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.5392268300056458},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5303353071212769},{"id":"https://openalex.org/C2778344882","wikidata":"https://www.wikidata.org/wiki/Q278938","display_name":"Shot (pellet)","level":2,"score":0.5265971422195435},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3793215751647949},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.32522833347320557},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.3018813729286194},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.1271878480911255},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.10460224747657776},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C178790620","wikidata":"https://www.wikidata.org/wiki/Q11351","display_name":"Organic chemistry","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/tip.2024.3459800","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tip.2024.3459800","pdf_url":null,"source":{"id":"https://openalex.org/S4210173141","display_name":"IEEE Transactions on Image Processing","issn_l":"1057-7149","issn":["1057-7149","1941-0042"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Image Processing","raw_type":"journal-article"},{"id":"pmid:39292596","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/39292596","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE transactions on image processing : a publication of the IEEE Signal Processing Society","raw_type":null}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.4300000071525574,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[{"id":"https://openalex.org/G6269517161","display_name":null,"funder_award_id":"BNR2023RC01004","funder_id":"https://openalex.org/F4320329777","funder_display_name":"Beijing National Research Center For Information Science And Technology"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320329777","display_name":"Beijing National Research Center For Information Science And Technology","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":78,"referenced_works":["https://openalex.org/W1933349210","https://openalex.org/W2116341502","https://openalex.org/W2131953535","https://openalex.org/W2277195237","https://openalex.org/W2529436507","https://openalex.org/W2560730294","https://openalex.org/W2561715562","https://openalex.org/W2741631785","https://openalex.org/W2745461083","https://openalex.org/W2747623286","https://openalex.org/W2760103357","https://openalex.org/W2793546384","https://openalex.org/W2883104598","https://openalex.org/W2904910963","https://openalex.org/W2926977875","https://openalex.org/W2930283066","https://openalex.org/W2950697717","https://openalex.org/W2962716332","https://openalex.org/W2962779575","https://openalex.org/W2963224792","https://openalex.org/W2963383024","https://openalex.org/W2963717374","https://openalex.org/W2964118342","https://openalex.org/W2964138343","https://openalex.org/W2964339842","https://openalex.org/W2969679616","https://openalex.org/W2981851019","https://openalex.org/W2998339148","https://openalex.org/W2998631105","https://openalex.org/W2998903229","https://openalex.org/W3004349648","https://openalex.org/W3009928773","https://openalex.org/W3016039242","https://openalex.org/W3029795912","https://openalex.org/W3035520037","https://openalex.org/W3134025030","https://openalex.org/W3152893301","https://openalex.org/W3172675210","https://openalex.org/W3172845486","https://openalex.org/W3194633557","https://openalex.org/W3195541832","https://openalex.org/W3211462570","https://openalex.org/W4207072548","https://openalex.org/W4312254032","https://openalex.org/W4312645695","https://openalex.org/W4322764098","https://openalex.org/W4379929801","https://openalex.org/W6634232107","https://openalex.org/W6676587327","https://openalex.org/W6716358881","https://openalex.org/W6736685754","https://openalex.org/W6738893770","https://openalex.org/W6745537798","https://openalex.org/W6747106673","https://openalex.org/W6747912417","https://openalex.org/W6748270630","https://openalex.org/W6751796012","https://openalex.org/W6752083267","https://openalex.org/W6753516098","https://openalex.org/W6754725917","https://openalex.org/W6755207826","https://openalex.org/W6757902542","https://openalex.org/W6768257583","https://openalex.org/W6770070887","https://openalex.org/W6773810110","https://openalex.org/W6780471797","https://openalex.org/W6786247770","https://openalex.org/W6791353385","https://openalex.org/W6797065961","https://openalex.org/W6799405210","https://openalex.org/W6800673378","https://openalex.org/W6809190652","https://openalex.org/W6811461410","https://openalex.org/W6842667729","https://openalex.org/W6843254161","https://openalex.org/W6846556436","https://openalex.org/W6848494730","https://openalex.org/W7066730619"],"related_works":["https://openalex.org/W2380179524","https://openalex.org/W4283365723","https://openalex.org/W2963001125","https://openalex.org/W2091233881","https://openalex.org/W2352366064","https://openalex.org/W4250820896","https://openalex.org/W2124102101","https://openalex.org/W4250305970","https://openalex.org/W1484550171","https://openalex.org/W2333383158"],"abstract_inverted_index":{"Recently,":[0],"there":[1],"exists":[2],"an":[3,14],"increased":[4,45],"research":[5],"interest":[6],"in":[7,50,95,101,104,149],"embodied":[8,75,115,171],"artificial":[9],"intelligence":[10],"(EAI),":[11],"which":[12],"involves":[13],"agent":[15],"learning":[16],"to":[17,43,62,69,84,87,110],"perform":[18],"a":[19,32,135,150,167,175,194],"specific":[20,136],"task":[21,86],"when":[22],"dynamically":[23],"interacting":[24],"with":[25,57,77,117,193],"the":[26,44,85,140,154,204],"surrounding":[27],"3D":[28,51],"environment.":[29],"There":[30],"into,":[31],"new":[33,63,198,205],"challenge":[34],"is":[35,92,157],"that":[36,120,187],"many":[37,190],"unseen":[38],"objects":[39,64,199],"may":[40],"appear":[41],"due":[42],"number":[46,196],"of":[47,169,189,197],"object":[48],"categories":[49],"scenes.":[52],"It":[53],"makes":[54],"developing":[55],"models":[56,103],"strong":[58],"zero-shot":[59,112],"generalization":[60,113],"ability":[61],"necessary.":[65],"Existing":[66],"work":[67],"tries":[68],"achieve":[70,203],"this":[71],"goal":[72],"by":[73,98,174],"providing":[74],"agents":[76],"massive":[78],"high-quality":[79],"human":[80],"annotations":[81],"closely":[82],"related":[83],"be":[88],"learned,":[89],"while":[90],"it":[91],"too":[93],"costly":[94],"practice.":[96],"Inspired":[97],"recent":[99],"advances":[100],"pre-trained":[102,141],"2D":[105],"visual":[106],"tasks,":[107],"we":[108,138],"attempt":[109],"boost":[111],"for":[114],"reasoning":[116,172],"vision-language":[118],"pre-training":[119],"can":[121,165],"encode":[122],"common":[123],"sense":[124],"as":[125],"general":[126],"prior":[127],"knowledge.":[128],"To":[129],"further":[130],"improve":[131,166],"its":[132],"performance":[133],"on":[134,184],"task,":[137],"rectify":[139],"representation":[142],"through":[143],"masked":[144],"scene":[145],"graph":[146],"modeling":[147],"(MSGM)":[148],"self-supervised":[151],"manner,":[152],"where":[153],"task-specific":[155],"knowledge":[156],"learned":[158],"from":[159],"iterative":[160],"message":[161],"passing.":[162],"Our":[163],"method":[164],"variety":[168],"representative":[170],"tasks":[173],"large":[176,195],"margin":[177],"(e.g.,":[178],"over":[179],"5.0%":[180],"w.r.t.":[181],"answer":[182],"accuracy":[183],"MP3D-EQA":[185],"dataset":[186],"consists":[188],"real-world":[191],"scenes":[192],"during":[200],"testing),":[201],"and":[202],"state-of-the-art":[206],"performance.":[207]},"counts_by_year":[{"year":2025,"cited_by_count":5}],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-10T00:00:00"}
