{"id":"https://openalex.org/W4416071347","doi":"https://doi.org/10.48550/arxiv.2506.03194","title":"HueManity: Probing Fine-Grained Visual Perception in MLLMs","display_name":"HueManity: Probing Fine-Grained Visual Perception in MLLMs","publication_year":2025,"publication_date":"2025-05-31","ids":{"openalex":"https://openalex.org/W4416071347","doi":"https://doi.org/10.48550/arxiv.2506.03194"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2506.03194","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2506.03194","pdf_url":"https://arxiv.org/pdf/2506.03194","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2506.03194","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5119980918","display_name":"Rynaa Grover","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Grover, Rynaa","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5092557153","display_name":"Jayant Sravan Tamarapalli","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tamarapalli, Jayant Sravan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5092557154","display_name":"Sahiti Yerramilli","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yerramilli, Sahiti","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5119980917","display_name":"Nilay Pande","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pande, Nilay","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5119980918"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9884999990463257,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9884999990463257,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.0019000000320374966,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13310","display_name":"Subtitles and Audiovisual Media","score":0.0010000000474974513,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/alphanumeric","display_name":"Alphanumeric","score":0.8432000279426575},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.6256999969482422},{"id":"https://openalex.org/keywords/visual-perception","display_name":"Visual perception","score":0.5055999755859375},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.49219998717308044},{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.40389999747276306},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.3531999886035919},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.3402999937534332},{"id":"https://openalex.org/keywords/psychophysics","display_name":"Psychophysics","score":0.3249000012874603}],"concepts":[{"id":"https://openalex.org/C2781003394","wikidata":"https://www.wikidata.org/wiki/Q737372","display_name":"Alphanumeric","level":2,"score":0.8432000279426575},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6539999842643738},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.6256999969482422},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5568000078201294},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.5055999755859375},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.49219998717308044},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.40389999747276306},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.3644999861717224},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.3531999886035919},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.3402999937534332},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.33000001311302185},{"id":"https://openalex.org/C15123163","wikidata":"https://www.wikidata.org/wiki/Q500096","display_name":"Psychophysics","level":3,"score":0.3249000012874603},{"id":"https://openalex.org/C2779332521","wikidata":"https://www.wikidata.org/wiki/Q1820694","display_name":"Legibility","level":2,"score":0.32089999318122864},{"id":"https://openalex.org/C158495155","wikidata":"https://www.wikidata.org/wiki/Q2369151","display_name":"Visual search","level":2,"score":0.3077999949455261},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.29750001430511475},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.29109999537467957},{"id":"https://openalex.org/C164280684","wikidata":"https://www.wikidata.org/wiki/Q5529040","display_name":"Gaze-contingency paradigm","level":4,"score":0.2831000089645386},{"id":"https://openalex.org/C160086991","wikidata":"https://www.wikidata.org/wiki/Q5939193","display_name":"Human visual system model","level":3,"score":0.27900001406669617},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.2773999869823456},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2718999981880188},{"id":"https://openalex.org/C185761835","wikidata":"https://www.wikidata.org/wiki/Q1431287","display_name":"Peripheral vision","level":2,"score":0.2628999948501587},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.2554999887943268},{"id":"https://openalex.org/C174478892","wikidata":"https://www.wikidata.org/wiki/Q4747455","display_name":"Amodal perception","level":3,"score":0.25360000133514404},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.25119999051094055}],"mesh":[],"locations_count":3,"locations":[{"id":"pmh:oai:arXiv.org:2506.03194","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2506.03194","pdf_url":"https://arxiv.org/pdf/2506.03194","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"pmh:doi:10.48550/arxiv.2506.03194","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2506.03194","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2506.03194","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2506.03194","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2506.03194","pdf_url":"https://arxiv.org/pdf/2506.03194","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"Multimodal":[1],"Large":[2],"Language":[3],"Models":[4],"(MLLMs)":[5],"demonstrate":[6],"strong":[7],"high-level":[8,143],"visual":[9,15,58,79],"reasoning":[10],"on":[11,99,106],"tasks":[12],"such":[13],"as":[14],"question":[16],"answering":[17],"and":[18,40,104,119],"image":[19],"captioning.":[20],"Yet":[21],"existing":[22],"benchmarks":[23,141],"largely":[24],"overlook":[25],"their":[26],"ability":[27],"to":[28,71,112],"capture":[29],"fine-grained":[30,57],"perceptual":[31,44,133],"details.":[32],"As":[33],"MLLMs":[34,86],"are":[35],"increasingly":[36],"deployed":[37],"in":[38,60,131],"safety":[39],"reliability":[41],"critical":[42,129],"settings,":[43],"acuity":[45],"becomes":[46],"essential.":[47],"We":[48],"present":[49],"HueManity,":[50],"a":[51,75,88,100,107,120,128],"scalable":[52],"automated":[53],"benchmark":[54],"for":[55],"assessing":[56],"perception":[59],"MLLMs.":[61],"HueManity":[62],"comprises":[63],"83,850":[64],"Ishihara-style":[65],"images":[66],"embedding":[67],"alphanumeric":[68,109],"strings,":[69],"designed":[70],"evaluate":[72],"pattern":[73],"recognition,":[74],"core":[76],"aspect":[77],"of":[78,83],"understanding.":[80],"Our":[81],"evaluation":[82],"nine":[84],"state-of-the-art":[85],"uncovers":[87],"striking":[89],"performance":[90,114],"deficit:":[91],"the":[92],"strongest":[93],"model":[94],"achieved":[95],"only":[96],"33.6%":[97],"accuracy":[98],"simple":[101],"numeric":[102],"task":[103],"3%":[105],"harder":[108],"task,":[110],"compared":[111],"near-ceiling":[113],"from":[115],"humans":[116],"(99.38%,":[117],"93.25%)":[118],"fine-tuned":[121],"ResNet-50":[122],"(96.5%,":[123],"94.5%).":[124],"These":[125],"findings":[126],"expose":[127],"weakness":[130],"MLLMs'":[132],"grounding,":[134],"one":[135],"that":[136],"remains":[137],"obscured":[138],"by":[139],"conventional":[140],"emphasizing":[142],"semantics.":[144]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2025-10-10T00:00:00"}
