{"id":"https://openalex.org/W7133708039","doi":"https://doi.org/10.48550/arxiv.2603.03942","title":"Lightweight Visual Reasoning for Socially-Aware Robots","display_name":"Lightweight Visual Reasoning for Socially-Aware Robots","publication_year":2026,"publication_date":"2026-03-04","ids":{"openalex":"https://openalex.org/W7133708039","doi":"https://doi.org/10.48550/arxiv.2603.03942"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2603.03942","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5050015281","display_name":"Alessio Galatolo","orcid":"https://orcid.org/0000-0002-9289-4659"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Galatolo, Alessio","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088868324","display_name":"Ronald Cumbal","orcid":"https://orcid.org/0000-0003-4472-4732"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cumbal, Ronald","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067990606","display_name":"Alexandros Rouchitsas","orcid":"https://orcid.org/0000-0003-3503-4676"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rouchitsas, Alexandros","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128206518","display_name":"Katie Winkle","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Winkle, Katie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113354273","display_name":"Didem G\u00fcrd\u00fcr Broo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Broo, Didem G\u00fcrd\u00fcr","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128194556","display_name":"Ginevra Castellano","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Castellano, Ginevra","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9405999779701233,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9405999779701233,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10709","display_name":"Social Robot Interaction and HRI","score":0.03359999880194664,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.004800000227987766,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.6743999719619751},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.5945000052452087},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.527899980545044},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.5199000239372253},{"id":"https://openalex.org/keywords/human\u2013robot-interaction","display_name":"Human\u2013robot interaction","score":0.365200012922287},{"id":"https://openalex.org/keywords/visual-perception","display_name":"Visual perception","score":0.35740000009536743}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7321000099182129},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.6743999719619751},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6513000130653381},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.5945000052452087},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.527899980545044},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.5199000239372253},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5045999884605408},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.49880000948905945},{"id":"https://openalex.org/C145460709","wikidata":"https://www.wikidata.org/wiki/Q859951","display_name":"Human\u2013robot interaction","level":3,"score":0.365200012922287},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.35740000009536743},{"id":"https://openalex.org/C19966478","wikidata":"https://www.wikidata.org/wiki/Q4810574","display_name":"Mobile robot","level":3,"score":0.3573000133037567},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.3440999984741211},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.3068000078201294},{"id":"https://openalex.org/C5339829","wikidata":"https://www.wikidata.org/wiki/Q1425977","display_name":"Machine vision","level":2,"score":0.29660001397132874},{"id":"https://openalex.org/C81074085","wikidata":"https://www.wikidata.org/wiki/Q366872","display_name":"Motion planning","level":3,"score":0.28200000524520874},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2621000111103058}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2603.03942","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2603.03942","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.03942","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2603.03942","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"score":0.46125346422195435,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Robots":[0],"operating":[1],"in":[2,34,47,78,117],"shared":[3],"human":[4,26],"environments":[5],"must":[6,16],"not":[7],"only":[8],"navigate,":[9],"interact,":[10],"and":[11,19,23,38,74,126,148,160,172],"detect":[12],"their":[13],"surroundings,":[14],"they":[15,44],"also":[17],"interpret":[18],"respond":[20],"to":[21],"dynamic,":[22],"often":[24],"unpredictable,":[25],"behaviours.":[27],"Although":[28],"recent":[29],"advances":[30],"have":[31],"shown":[32],"promise":[33],"enhancing":[35],"robotic":[36],"perception":[37],"instruction-following":[39],"using":[40],"Vision-Language":[41],"Models":[42],"(VLMs),":[43],"remain":[45],"limited":[46],"addressing":[48],"the":[49,69,75,94,103,175],"complexities":[50],"of":[51],"multimodal":[52],"human-robot":[53],"interactions":[54],"(HRI).":[55],"Motivated":[56],"by":[57,141],"this":[58,110],"challenge,":[59],"we":[60],"introduce":[61],"a":[62,87,98,118],"lightweight":[63],"language-to-vision":[64],"feedback":[65],"module":[66,81],"that":[67,101,134],"closes":[68],"loop":[70],"between":[71],"an":[72],"LLM":[73],"vision":[76],"encoder":[77,95],"VLMs.":[79],"The":[80],"projects":[82],"image-token":[83],"hidden":[84],"states":[85],"through":[86],"gated":[88],"Multi-Layer":[89],"Perceptron":[90],"(MLP)":[91],"back":[92],"into":[93],"input,":[96],"prompting":[97],"second":[99],"pass":[100],"reinterprets":[102],"scene":[104,123],"under":[105],"text":[106],"context.":[107],"We":[108],"evaluate":[109],"approach":[111],"on":[112,174],"three":[113],"robotics-centred":[114],"tasks:":[115],"navigation":[116,167],"simulated":[119],"environment":[120],"(Habitat),":[121],"sequential":[122],"description":[124,146],"(Mementos-Robotics),":[125],"human-intention":[127],"recognition":[128],"(our":[129],"HRI":[130],"dataset).":[131],"Results":[132],"show":[133,165],"our":[135],"method":[136],"improves":[137],"Qwen":[138],"2.5":[139],"(7B)":[140],"$3.3\\%$":[142],"(less":[143],"distance),":[144],"$+0.057$":[145],"score,":[147],"$+2.93\\%$":[149],"accuracy,":[150],"with":[151],"less":[152],"than":[153],"$3\\%$":[154],"extra":[155],"parameters;":[156],"Gemma":[157],"3":[158],"(4B)":[159,164],"LLaVA":[161],"OV":[162],"1.5":[163],"mixed":[166],"results":[168],"but":[169],"gains":[170],"$+0.111,+0.055$":[171],"$+10.81\\%,+4.79\\%$":[173],"latter":[176],"two":[177],"tasks.":[178],"Code":[179],"is":[180],"available":[181],"at":[182],"https://github.com/alessioGalatolo/VLM-Reasoning-for-Robotics":[183]},"counts_by_year":[],"updated_date":"2026-07-01T08:55:40.977307","created_date":"2026-03-06T00:00:00"}
