{"id":"https://openalex.org/W4405786114","doi":"https://doi.org/10.1109/iros58592.2024.10801993","title":"ManipVQA: Injecting Robotic Affordance and Physically Grounded Information into Multi-Modal Large Language Models","display_name":"ManipVQA: Injecting Robotic Affordance and Physically Grounded Information into Multi-Modal Large Language Models","publication_year":2024,"publication_date":"2024-10-14","ids":{"openalex":"https://openalex.org/W4405786114","doi":"https://doi.org/10.1109/iros58592.2024.10801993"},"language":"en","primary_location":{"id":"doi:10.1109/iros58592.2024.10801993","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iros58592.2024.10801993","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5067080265","display_name":"Siyuan Huang","orcid":"https://orcid.org/0000-0003-1524-7148"},"institutions":[{"id":"https://openalex.org/I4210122302","display_name":"ShangHai JiAi Genetics & IVF Institute","ror":"https://ror.org/02rgbry52","country_code":"CN","type":"healthcare","lineage":["https://openalex.org/I4210122302"]},{"id":"https://openalex.org/I4391012619","display_name":"Shanghai Artificial Intelligence Laboratory","ror":"https://ror.org/03wkvpx79","country_code":null,"type":"facility","lineage":["https://openalex.org/I4391012619"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Siyuan Huang","raw_affiliation_strings":["Shanghai AI Laboratory"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Shanghai AI Laboratory","institution_ids":["https://openalex.org/I4210122302","https://openalex.org/I4391012619"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104261352","display_name":"Iaroslav Ponomarenko","orcid":null},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Iaroslav Ponomarenko","raw_affiliation_strings":["Peking University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Peking University","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101637347","display_name":"Zhengkai Jiang","orcid":"https://orcid.org/0000-0003-4064-994X"},"institutions":[{"id":"https://openalex.org/I9796191","display_name":"University College of Applied Science","ror":"https://ror.org/00f72x493","country_code":"PS","type":"education","lineage":["https://openalex.org/I9796191"]}],"countries":["PS"],"is_corresponding":false,"raw_author_name":"Zhengkai Jiang","raw_affiliation_strings":["UCAS"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"UCAS","institution_ids":["https://openalex.org/I9796191"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100634697","display_name":"Xiaoqi Li","orcid":"https://orcid.org/0000-0001-7462-6303"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaoqi Li","raw_affiliation_strings":["Peking University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Peking University","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101600241","display_name":"Xiaobin Hu","orcid":"https://orcid.org/0000-0002-5886-4026"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiaobin Hu","raw_affiliation_strings":["TUM"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"TUM","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100362398","display_name":"Peng Gao","orcid":"https://orcid.org/0000-0002-1290-6972"},"institutions":[{"id":"https://openalex.org/I4210122302","display_name":"ShangHai JiAi Genetics & IVF Institute","ror":"https://ror.org/02rgbry52","country_code":"CN","type":"healthcare","lineage":["https://openalex.org/I4210122302"]},{"id":"https://openalex.org/I4391012619","display_name":"Shanghai Artificial Intelligence Laboratory","ror":"https://ror.org/03wkvpx79","country_code":null,"type":"facility","lineage":["https://openalex.org/I4391012619"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Peng Gao","raw_affiliation_strings":["Shanghai AI Laboratory"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Shanghai AI Laboratory","institution_ids":["https://openalex.org/I4210122302","https://openalex.org/I4391012619"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100732450","display_name":"Hongsheng Li","orcid":"https://orcid.org/0000-0002-2664-7975"},"institutions":[{"id":"https://openalex.org/I4391012619","display_name":"Shanghai Artificial Intelligence Laboratory","ror":"https://ror.org/03wkvpx79","country_code":null,"type":"facility","lineage":["https://openalex.org/I4391012619"]},{"id":"https://openalex.org/I4210122302","display_name":"ShangHai JiAi Genetics & IVF Institute","ror":"https://ror.org/02rgbry52","country_code":"CN","type":"healthcare","lineage":["https://openalex.org/I4210122302"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hongsheng Li","raw_affiliation_strings":["Shanghai AI Laboratory"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Shanghai AI Laboratory","institution_ids":["https://openalex.org/I4210122302","https://openalex.org/I4391012619"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100389347","display_name":"Hao Dong","orcid":"https://orcid.org/0000-0002-0132-0239"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hao Dong","raw_affiliation_strings":["Peking University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Peking University","institution_ids":["https://openalex.org/I20231570"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5067080265"],"corresponding_institution_ids":["https://openalex.org/I4210122302","https://openalex.org/I4391012619"],"apc_list":null,"apc_paid":null,"fwci":5.6884,"has_fulltext":false,"cited_by_count":18,"citation_normalized_percentile":{"value":0.96568262,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"7580","last_page":"7587"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9860000014305115,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9860000014305115,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9679999947547913,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/affordance","display_name":"Affordance","score":0.9421823024749756},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.6950132846832275},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6347579956054688},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.5939439535140991},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.36926913261413574},{"id":"https://openalex.org/keywords/cognitive-science","display_name":"Cognitive science","score":0.334619402885437},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.19513371586799622},{"id":"https://openalex.org/keywords/materials-science","display_name":"Materials science","score":0.057453930377960205}],"concepts":[{"id":"https://openalex.org/C194995250","wikidata":"https://www.wikidata.org/wiki/Q531136","display_name":"Affordance","level":2,"score":0.9421823024749756},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.6950132846832275},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6347579956054688},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.5939439535140991},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.36926913261413574},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.334619402885437},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.19513371586799622},{"id":"https://openalex.org/C192562407","wikidata":"https://www.wikidata.org/wiki/Q228736","display_name":"Materials science","level":0,"score":0.057453930377960205},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/iros58592.2024.10801993","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iros58592.2024.10801993","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":38,"referenced_works":["https://openalex.org/W1524405667","https://openalex.org/W2108598243","https://openalex.org/W2251512949","https://openalex.org/W3035624836","https://openalex.org/W3120441392","https://openalex.org/W4221153063","https://openalex.org/W4224912544","https://openalex.org/W4283785166","https://openalex.org/W4386065393","https://openalex.org/W4386071509","https://openalex.org/W4388720459","https://openalex.org/W4389666682","https://openalex.org/W4390871954","https://openalex.org/W4401416612","https://openalex.org/W4401417048","https://openalex.org/W4402727730","https://openalex.org/W4402916210","https://openalex.org/W6694395031","https://openalex.org/W6791353385","https://openalex.org/W6801810553","https://openalex.org/W6842594805","https://openalex.org/W6843759960","https://openalex.org/W6849177959","https://openalex.org/W6850503672","https://openalex.org/W6850625674","https://openalex.org/W6852796095","https://openalex.org/W6853702739","https://openalex.org/W6854262950","https://openalex.org/W6854738657","https://openalex.org/W6854929498","https://openalex.org/W6856224812","https://openalex.org/W6857151620","https://openalex.org/W6857178701","https://openalex.org/W6857660107","https://openalex.org/W6858376594","https://openalex.org/W6858380932","https://openalex.org/W6861675914","https://openalex.org/W6861787071"],"related_works":["https://openalex.org/W1972718289","https://openalex.org/W1791514435","https://openalex.org/W2346831895","https://openalex.org/W2248634132","https://openalex.org/W3049116993","https://openalex.org/W1541884709","https://openalex.org/W2589081601","https://openalex.org/W2226037301","https://openalex.org/W2417026147","https://openalex.org/W3089455568"],"abstract_inverted_index":{"While":[0],"the":[1,125,145,151,169],"integration":[2],"of":[3,35,92,100,129,172],"Multi-modal":[4],"Large":[5],"Language":[6],"Models":[7],"(MLLMs)":[8],"with":[9,72,124],"robotic":[10,107,154,160],"systems":[11],"has":[12],"significantly":[13],"improved":[14],"robots\u2019":[15],"ability":[16],"to":[17,32,105],"understand":[18],"and":[19,53,88,114,137,162,176],"execute":[20],"natural":[21],"language":[22],"instructions,":[23],"their":[24],"performance":[25,171],"in":[26,50,109,159],"manipulation":[27],"tasks":[28],"remains":[29],"limited":[30],"due":[31],"a":[33,66,76,89,97,133,139],"lack":[34],"robotics-specific":[36,122],"knowledge.":[37],"Conventional":[38],"MLLMs":[39,71],"are":[40,178],"typically":[41],"trained":[42],"on":[43],"generic":[44],"image-text":[45],"pairs,":[46],"leaving":[47],"them":[48],"deficient":[49],"understanding":[51,91,108],"affordances":[52],"physical":[54,93,115],"concepts":[55],"crucial":[56],"for":[57],"manipulation.":[58],"To":[59,118],"address":[60],"this":[61,121],"gap,":[62],"we":[63,131],"propose":[64],"ManipVQA,":[65],"novel":[67],"framework":[68],"that":[69],"infuses":[70],"manipulation-centric":[73],"knowledge":[74,123],"through":[75],"Visual":[77],"Question-Answering":[78],"(VQA)":[79],"format.":[80],"This":[81,142],"approach":[82],"encompasses":[83],"tool":[84,110],"detection,":[85,111],"affordance":[86,112],"recognition,":[87],"broader":[90],"concepts.":[94],"We":[95],"curated":[96],"diverse":[98],"dataset":[99,177],"images":[101],"depicting":[102],"interactive":[103],"objects,":[104],"challenge":[106],"prediction,":[113],"concept":[116],"comprehension.":[117],"effectively":[119],"integrate":[120],"inherent":[126],"vision-reasoning":[127,147],"capabilities":[128],"MLLMs,":[130],"leverage":[132],"unified":[134],"VQA":[135],"format":[136],"devise":[138],"fine-tuning":[140],"strategy.":[141],"strategy":[143],"preserves":[144],"original":[146],"abilities":[148],"while":[149],"incorporating":[150],"newly":[152],"acquired":[153],"insights.":[155],"Empirical":[156],"evaluations":[157],"conducted":[158],"simulators":[161],"across":[163],"various":[164],"vision":[165],"task":[166],"benchmarks":[167],"demonstrate":[168],"robust":[170],"ManipVQA.":[173],"The":[174],"code":[175],"publicly":[179],"available":[180],"at":[181],"https://github.com/SiyuanHuang95/ManipVQA.":[182]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":15}],"updated_date":"2026-05-21T09:19:25.381259","created_date":"2025-10-10T00:00:00"}
