{"id":"https://openalex.org/W6910637661","doi":"https://doi.org/10.48550/arxiv.2505.04637","title":"Adaptive Token Boundaries: Integrating Human Chunking Mechanisms into Multimodal LLMs","display_name":"Adaptive Token Boundaries: Integrating Human Chunking Mechanisms into Multimodal LLMs","publication_year":2025,"publication_date":"2025-05-03","ids":{"openalex":"https://openalex.org/W6910637661","doi":"https://doi.org/10.48550/arxiv.2505.04637"},"language":"en","primary_location":{"id":"doi:10.48550/arxiv.2505.04637","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2505.04637","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article-journal"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2505.04637","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Yu, Dongxing","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yu, Dongxing","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9570000171661377,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9570000171661377,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10465","display_name":"Neurobiology of Language and Bilingualism","score":0.0052999998442828655,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.004699999932199717,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/chunking","display_name":"Chunking (psychology)","score":0.6360999941825867},{"id":"https://openalex.org/keywords/cognition","display_name":"Cognition","score":0.546999990940094},{"id":"https://openalex.org/keywords/parallels","display_name":"Parallels","score":0.490200012922287},{"id":"https://openalex.org/keywords/lexical-analysis","display_name":"Lexical analysis","score":0.48080000281333923},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4537999927997589},{"id":"https://openalex.org/keywords/computational-model","display_name":"Computational model","score":0.4162999987602234},{"id":"https://openalex.org/keywords/connectionism","display_name":"Connectionism","score":0.40939998626708984},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.36570000648498535},{"id":"https://openalex.org/keywords/soar","display_name":"Soar","score":0.3610999882221222}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7264000177383423},{"id":"https://openalex.org/C203357204","wikidata":"https://www.wikidata.org/wiki/Q1089605","display_name":"Chunking (psychology)","level":2,"score":0.6360999941825867},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5491999983787537},{"id":"https://openalex.org/C169900460","wikidata":"https://www.wikidata.org/wiki/Q2200417","display_name":"Cognition","level":2,"score":0.546999990940094},{"id":"https://openalex.org/C2775922551","wikidata":"https://www.wikidata.org/wiki/Q7135033","display_name":"Parallels","level":2,"score":0.490200012922287},{"id":"https://openalex.org/C176982825","wikidata":"https://www.wikidata.org/wiki/Q835922","display_name":"Lexical analysis","level":2,"score":0.48080000281333923},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4537999927997589},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4174000024795532},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.4162999987602234},{"id":"https://openalex.org/C8521452","wikidata":"https://www.wikidata.org/wiki/Q203790","display_name":"Connectionism","level":3,"score":0.40939998626708984},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.36570000648498535},{"id":"https://openalex.org/C17305859","wikidata":"https://www.wikidata.org/wiki/Q382944","display_name":"Soar","level":2,"score":0.3610999882221222},{"id":"https://openalex.org/C87868495","wikidata":"https://www.wikidata.org/wiki/Q750843","display_name":"Information processing","level":2,"score":0.35120001435279846},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.3431999981403351},{"id":"https://openalex.org/C120936955","wikidata":"https://www.wikidata.org/wiki/Q2155640","display_name":"Empirical research","level":2,"score":0.33570000529289246},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3327000141143799},{"id":"https://openalex.org/C166052673","wikidata":"https://www.wikidata.org/wiki/Q83021","display_name":"Empirical evidence","level":2,"score":0.31850001215934753},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.31310001015663147},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.3098999857902527},{"id":"https://openalex.org/C161407221","wikidata":"https://www.wikidata.org/wiki/Q4382939","display_name":"Cognitive model","level":3,"score":0.3093999922275543},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.30559998750686646},{"id":"https://openalex.org/C139807058","wikidata":"https://www.wikidata.org/wiki/Q352374","display_name":"Adaptation (eye)","level":2,"score":0.3012999892234802},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.2978000044822693},{"id":"https://openalex.org/C108154423","wikidata":"https://www.wikidata.org/wiki/Q1469792","display_name":"Salience (neuroscience)","level":2,"score":0.29280000925064087},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.28619998693466187},{"id":"https://openalex.org/C155092808","wikidata":"https://www.wikidata.org/wiki/Q182557","display_name":"Computational linguistics","level":2,"score":0.27720001339912415},{"id":"https://openalex.org/C20854674","wikidata":"https://www.wikidata.org/wiki/Q4386060","display_name":"Cognitive architecture","level":3,"score":0.27489998936653137},{"id":"https://openalex.org/C2993724205","wikidata":"https://www.wikidata.org/wiki/Q315","display_name":"Human language","level":2,"score":0.273499995470047},{"id":"https://openalex.org/C51620047","wikidata":"https://www.wikidata.org/wiki/Q23399","display_name":"Memetics","level":2,"score":0.2619999945163727},{"id":"https://openalex.org/C142816647","wikidata":"https://www.wikidata.org/wiki/Q5573018","display_name":"Glyph (data visualization)","level":3,"score":0.2603999972343445},{"id":"https://openalex.org/C2777393815","wikidata":"https://www.wikidata.org/wiki/Q662991","display_name":"Danaus","level":3,"score":0.25270000100135803}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2505.04637","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2505.04637","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article-journal"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2505.04637","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2505.04637","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article-journal"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.7271608710289001,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"advancements":[1],"in":[2,12,50,106],"multimodal":[3,29],"large":[4],"language":[5],"models":[6,122],"(MLLMs)":[7],"have":[8],"demonstrated":[9],"remarkable":[10],"capabilities":[11],"processing":[13],"diverse":[14],"data":[15],"types,":[16],"yet":[17],"significant":[18,118],"disparities":[19],"persist":[20],"between":[21,41,155],"human":[22,42,56,84,156],"cognitive":[23,107],"processes":[24],"and":[25,46,102,142,158],"computational":[26],"approaches":[27],"to":[28,77,148],"information":[30,85],"integration.":[31],"This":[32],"research":[33],"presents":[34],"a":[35,89],"systematic":[36],"investigation":[37],"into":[38],"the":[39,79,149,153],"parallels":[40],"cross-modal":[43,94],"chunking":[44],"mechanisms":[45,104],"token":[47],"representation":[48],"methodologies":[49],"MLLMs.":[51],"Through":[52],"empirical":[53,163],"studies":[54],"comparing":[55],"performance":[57],"patterns":[58,141],"with":[59],"model":[60],"behaviors":[61],"across":[62],"visual-linguistic":[63],"tasks,":[64],"we":[65],"demonstrate":[66,112],"that":[67,96,113],"conventional":[68],"static":[69],"tokenization":[70,95],"schemes":[71],"fundamentally":[72],"constrain":[73],"current":[74],"models'":[75],"capacity":[76],"simulate":[78],"dynamic,":[80],"context-sensitive":[81],"nature":[82],"of":[83,152],"processing.":[86],"We":[87],"propose":[88],"novel":[90],"framework":[91],"for":[92,165],"dynamic":[93],"incorporates":[97],"adaptive":[98],"boundaries,":[99],"hierarchical":[100],"representations,":[101],"alignment":[103],"grounded":[105],"science":[108],"principles.":[109],"Quantitative":[110],"evaluations":[111],"our":[114],"approach":[115],"yields":[116],"statistically":[117],"improvements":[119],"over":[120],"state-of-the-art":[121],"on":[123,127,132],"benchmark":[124],"tasks":[125],"(+7.8%":[126],"Visual":[128],"Question":[129],"Answering,":[130],"+5.3%":[131],"Complex":[133],"Scene":[134],"Description)":[135],"while":[136,161],"exhibiting":[137],"more":[138,167],"human-aligned":[139],"error":[140],"attention":[143],"distributions.":[144],"These":[145],"findings":[146],"contribute":[147],"theoretical":[150],"understanding":[151],"relationship":[154],"cognition":[157],"artificial":[159],"intelligence,":[160],"providing":[162],"evidence":[164],"developing":[166],"cognitively":[168],"plausible":[169],"AI":[170],"systems.":[171]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
