{"id":"https://openalex.org/W7160896128","doi":"https://doi.org/10.48550/arxiv.2605.08526","title":"Skill-CMIB: Multimodal Agent Skill for Consistent Action via Conditional Multimodal Information Bottleneck","display_name":"Skill-CMIB: Multimodal Agent Skill for Consistent Action via Conditional Multimodal Information Bottleneck","publication_year":2026,"publication_date":"2026-05-08","ids":{"openalex":"https://openalex.org/W7160896128","doi":"https://doi.org/10.48550/arxiv.2605.08526"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.08526","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.08526","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.08526","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135977260","display_name":"Zihan Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Zihan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135912183","display_name":"Junda Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Junda","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135943454","display_name":"Tong Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Tong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113392281","display_name":"Qianqi Yan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yan, Qianqi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135954580","display_name":"Rohan Surana","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Surana, Rohan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042584669","display_name":"Uttaran Bhattacharya","orcid":"https://orcid.org/0000-0003-2141-9276"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bhattacharya, Uttaran","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135912223","display_name":"Lina Yao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yao, Lina","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135984667","display_name":"Xin Eric Wang","orcid":"https://orcid.org/0000-0003-3001-4676"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xin Eric","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135992211","display_name":"Julian McAuley","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"McAuley, Julian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.39500001072883606,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.39500001072883606,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11431","display_name":"Action Observation and Synchronization","score":0.1379999965429306,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.1071000024676323,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.803600013256073},{"id":"https://openalex.org/keywords/redundancy","display_name":"Redundancy (engineering)","score":0.6306999921798706},{"id":"https://openalex.org/keywords/information-bottleneck-method","display_name":"Information bottleneck method","score":0.5304999947547913},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.529699981212616},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.49459999799728394},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.4884999990463257}],"concepts":[{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.803600013256073},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6970999836921692},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.6306999921798706},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5667999982833862},{"id":"https://openalex.org/C60008888","wikidata":"https://www.wikidata.org/wiki/Q6031013","display_name":"Information bottleneck method","level":3,"score":0.5304999947547913},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.529699981212616},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5174000263214111},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.49459999799728394},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.4884999990463257},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.44670000672340393},{"id":"https://openalex.org/C2780660688","wikidata":"https://www.wikidata.org/wiki/Q25052564","display_name":"Multimodal learning","level":2,"score":0.40619999170303345},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.3709000051021576},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.33009999990463257},{"id":"https://openalex.org/C152565575","wikidata":"https://www.wikidata.org/wiki/Q1124538","display_name":"Conditional random field","level":2,"score":0.3228999972343445},{"id":"https://openalex.org/C155512373","wikidata":"https://www.wikidata.org/wiki/Q287450","display_name":"Residual","level":2,"score":0.3003000020980835},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2596000134944916}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.08526","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.08526","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.08526","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.08526","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.819550096988678}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"While":[0],"LLM-based":[1],"agents":[2],"excel":[3],"at":[4],"planning":[5],"and":[6,55,74,92,97,151,165,201,207],"executing":[7],"long":[8],"action":[9],"sequences,":[10],"their":[11],"execution":[12,231],"often":[13],"remains":[14,178],"inconsistent":[15],"across":[16,53],"trials,":[17],"limiting":[18],"reliability.":[19],"Consolidating":[20],"agent":[21],"consistency":[22],"requires":[23],"distilling":[24,161],"trial-error":[25],"trajectories":[26],"into":[27],"reusable":[28,66,226],"skills":[29,71,84,150,228],"that":[30,48,59,177,217,229],"preserve":[31],"task-relevant":[32],"invariants":[33,50],"while":[34,69,111],"discarding":[35],"trajectory-specific":[36],"noise.":[37,98],"However,":[38],"in":[39,78,175],"multimodal":[40,139,149,169,190,227],"settings,":[41],"the":[42,189,193],"key":[43],"challenge":[44],"is":[45],"not":[46],"only":[47,172],"useful":[49],"are":[51,72],"distributed":[52],"vision":[54],"language":[56],"information,":[57],"but":[58],"different":[60,63],"modalities":[61],"support":[62],"kinds":[64],"of":[65],"skill":[67,120,140,163],"content:":[68],"some":[70],"verbalizable":[73,119],"interpretable,":[75],"others":[76],"reside":[77],"perceptual":[79,87,124,208],"evidence":[80],"beyond":[81,180],"text.":[82,181],"Text-only":[83],"may":[85],"lose":[86],"cues,":[88],"whereas":[89],"storing":[90],"text":[91,194],"perception":[93,176],"naively":[94],"introduces":[95],"redundancy":[96,200],"Existing":[99],"inference-time":[100],"methods,":[101],"such":[102],"as":[103],"self-consistency,":[104],"improve":[105,230],"reliability":[106],"through":[107],"costly":[108],"multi-sample":[109,235],"decoding,":[110],"internalization":[112],"strategies":[113],"lack":[114],"a":[115,136,145,158,167,214],"way":[116],"to":[117,223],"separate":[118],"content":[121],"from":[122],"residual":[123,173],"information.":[125],"To":[126],"address":[127],"this,":[128],"we":[129],"introduce":[130],"Conditional":[131],"Multimodal":[132],"Information":[133],"Bottleneck":[134],"(CMIB),":[135],"method":[137],"for":[138],"construction.":[141],"CMIB":[142,186,212],"begins":[143],"with":[144,213],"joint":[146],"bottleneck":[147,160,170],"over":[148,205],"derives":[152],"an":[153],"exact":[154],"sequential":[155],"decomposition:":[156],"(1)":[157],"text-stage":[159],"interpretable":[162],"cards,":[164],"(2)":[166],"conditional":[168,220],"compressing":[171],"information":[174],"predictive":[179],"Unlike":[182],"naive":[183],"two-stream":[184],"formulations,":[185],"explicitly":[187],"conditions":[188],"latent":[191],"on":[192],"skill,":[195],"thus":[196],"structurally":[197],"reducing":[198],"cross-modal":[199],"enabling":[202],"independent":[203],"control":[204],"textual":[206],"compression.":[209],"We":[210],"instantiate":[211],"variational":[215],"objective":[216],"makes":[218],"its":[219],"decomposition":[221],"tractable":[222],"optimize,":[224],"yielding":[225],"stability":[232],"without":[233],"incurring":[234],"inference":[236],"overhead.":[237]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-13T00:00:00"}
