{"id":"https://openalex.org/W7126088754","doi":"https://doi.org/10.48550/arxiv.2601.20796","title":"Dissecting Multimodal In-Context Learning: Modality Asymmetries and Circuit Dynamics in modern Transformers","display_name":"Dissecting Multimodal In-Context Learning: Modality Asymmetries and Circuit Dynamics in modern Transformers","publication_year":2026,"publication_date":"2026-01-28","ids":{"openalex":"https://openalex.org/W7126088754","doi":"https://doi.org/10.48550/arxiv.2601.20796"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2601.20796","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124158830","display_name":"Yiran Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Huang, Yiran","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103125177","display_name":"Karsten Roth","orcid":"https://orcid.org/0000-0003-1510-7217"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Roth, Karsten","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124190104","display_name":"Quentin Bouniot","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bouniot, Quentin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124288213","display_name":"Wenjia Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Wenjia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5120644176","display_name":"Zeynep Akata","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Akata, Zeynep","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5124158830"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.16189999878406525,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.16189999878406525,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12090","display_name":"Language and cultural evolution","score":0.15199999511241913,"subfield":{"id":"https://openalex.org/subfields/3316","display_name":"Cultural Studies"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.10429999977350235,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.5893999934196472},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5859000086784363},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.4749000072479248},{"id":"https://openalex.org/keywords/multimodal-learning","display_name":"Multimodal learning","score":0.47209998965263367},{"id":"https://openalex.org/keywords/testbed","display_name":"Testbed","score":0.4359000027179718},{"id":"https://openalex.org/keywords/synthetic-data","display_name":"Synthetic data","score":0.3878999948501587}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6797999739646912},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.5893999934196472},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5859000086784363},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5776000022888184},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.4749000072479248},{"id":"https://openalex.org/C2780660688","wikidata":"https://www.wikidata.org/wiki/Q25052564","display_name":"Multimodal learning","level":2,"score":0.47209998965263367},{"id":"https://openalex.org/C31395832","wikidata":"https://www.wikidata.org/wiki/Q1318674","display_name":"Testbed","level":2,"score":0.4359000027179718},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4066999852657318},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.3878999948501587},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.32749998569488525},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.31310001015663147},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.27480000257492065}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2601.20796","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2601.20796","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.20796","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2601.20796","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"score":0.6843786239624023,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Transformer-based":[0],"multimodal":[1,87,113,135,152],"large":[2],"language":[3],"models":[4],"often":[5],"exhibit":[6],"in-context":[7,27,133],"learning":[8,92],"(ICL)":[9],"abilities.":[10],"Motivated":[11],"by":[12,55],"this":[13,31],"phenomenon,":[14],"we":[15,70],"ask:":[16],"how":[17],"do":[18],"transformers":[19,38,156],"learn":[20],"to":[21,85,115],"associate":[22],"information":[23],"across":[24,142],"modalities":[25],"from":[26,99,131],"examples?":[28],"We":[29,53],"investigate":[30],"question":[32],"through":[33],"controlled":[34,160],"experiments":[35],"on":[36,40,96,124],"small":[37],"trained":[39],"synthetic":[41],"classification":[42],"tasks,":[43],"enabling":[44],"precise":[45],"manipulation":[46],"of":[47,59],"data":[48,79,98,105],"statistics":[49],"and":[50,138,157],"model":[51],"architecture.":[52],"begin":[54],"revisiting":[56],"core":[57],"principles":[58],"unimodal":[60],"ICL":[61,114,153],"in":[62,107,154],"modern":[63,155],"transformers.":[64],"While":[65],"several":[66],"prior":[67],"findings":[68,145],"replicate,":[69],"find":[71],"that":[72,120,128],"Rotary":[73],"Position":[74],"Embeddings":[75],"(RoPE)":[76],"increases":[77],"the":[78,86,108],"complexity":[80,106],"threshold":[81],"for":[82,112,150,162],"ICL.":[83],"Extending":[84],"setting":[88],"reveals":[89],"a":[90,100,147,159],"fundamental":[91],"asymmetry:":[93],"when":[94],"pretrained":[95],"high-diversity":[97],"primary":[101],"modality,":[102],"surprisingly":[103],"low":[104],"secondary":[109],"modality":[110],"suffices":[111],"emerge.":[116],"Mechanistic":[117],"analysis":[118],"shows":[119],"both":[121],"settings":[122],"rely":[123],"an":[125],"induction-style":[126],"mechanism":[127],"copies":[129],"labels":[130],"matching":[132],"exemplars;":[134],"training":[136],"refines":[137],"extends":[139],"these":[140],"circuits":[141],"modalities.":[143],"Our":[144],"provide":[146],"mechanistic":[148],"foundation":[149],"understanding":[151],"introduce":[158],"testbed":[161],"future":[163],"investigation.":[164]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-01-30T00:00:00"}
