{"id":"https://openalex.org/W7133341616","doi":"https://doi.org/10.48550/arxiv.2603.01471","title":"Reconstructing Content via Collaborative Attention to Improve Multimodal Embedding Quality","display_name":"Reconstructing Content via Collaborative Attention to Improve Multimodal Embedding Quality","publication_year":2026,"publication_date":"2026-03-02","ids":{"openalex":"https://openalex.org/W7133341616","doi":"https://doi.org/10.48550/arxiv.2603.01471"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.01471","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.01471","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.01471","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5029727538","display_name":"Jiahan Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Chen, Jiahan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128004143","display_name":"Da Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Da","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034939611","display_name":"Hengran Zhang","orcid":"https://orcid.org/0009-0004-1144-1298"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Hengran","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084642830","display_name":"Yinqiong Cai","orcid":"https://orcid.org/0000-0002-7869-8213"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cai, Yinqiong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127909335","display_name":"Lixin Su","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Su, Lixin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109736354","display_name":"Jiafeng Guo","orcid":"https://orcid.org/0000-0002-2793-3893"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Jiafeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127974735","display_name":"Daiting Shi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shi, Daiting","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127876795","display_name":"Dawei Yin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yin, Dawei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5127978571","display_name":"Keping Bi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bi, Keping","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5029727538"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.3824000060558319,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.3824000060558319,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.22769999504089355,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.11299999803304672,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.8762999773025513},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.5170000195503235},{"id":"https://openalex.org/keywords/limiting","display_name":"Limiting","score":0.4406000077724457},{"id":"https://openalex.org/keywords/multimodal-interaction","display_name":"Multimodal interaction","score":0.3409000039100647},{"id":"https://openalex.org/keywords/multimodal-learning","display_name":"Multimodal learning","score":0.3197000026702881},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.29829999804496765}],"concepts":[{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.8762999773025513},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7355999946594238},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5353999733924866},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.5170000195503235},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.4406000077724457},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3617999851703644},{"id":"https://openalex.org/C135641252","wikidata":"https://www.wikidata.org/wiki/Q738567","display_name":"Multimodal interaction","level":2,"score":0.3409000039100647},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3379000127315521},{"id":"https://openalex.org/C2780660688","wikidata":"https://www.wikidata.org/wiki/Q25052564","display_name":"Multimodal learning","level":2,"score":0.3197000026702881},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.29829999804496765},{"id":"https://openalex.org/C2778152352","wikidata":"https://www.wikidata.org/wiki/Q5165061","display_name":"Content (measure theory)","level":2,"score":0.29429998993873596},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.2913999855518341},{"id":"https://openalex.org/C2779136372","wikidata":"https://www.wikidata.org/wiki/Q10283002","display_name":"Information flow","level":2,"score":0.28999999165534973},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2799000144004822},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2529999911785126}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.01471","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.01471","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.01471","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.01471","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Multimodal":[0],"embedding":[1,46,76,95,156,177],"models,":[2],"rooted":[3],"in":[4],"multimodal":[5,75,94,122,176],"large":[6],"language":[7],"models":[8,178],"(MLLMs),":[9],"have":[10],"yielded":[11],"significant":[12],"performance":[13,186],"improvements":[14],"across":[15],"diverse":[16],"tasks":[17],"such":[18],"as":[19,74,164],"retrieval":[20],"and":[21,40,55,103,152,181],"classification.":[22],"However,":[23],"most":[24],"existing":[25,173],"approaches":[26],"rely":[27],"heavily":[28],"on":[29,90,144],"large-scale":[30],"contrastive":[31,140],"learning,":[32],"with":[33],"limited":[34],"exploration":[35],"of":[36,43,59,67,129,172],"how":[37],"the":[38,52,65,100,110,116,121,126,130,133,136,170],"architectural":[39],"training":[41],"paradigms":[42],"MLLMs":[44,60],"affect":[45],"quality.":[47,157],"While":[48],"effective":[49,166],"for":[50,93,138],"generation,":[51],"causal":[53],"attention":[54,101],"next-token":[56],"prediction":[57],"paradigm":[58,88],"does":[61],"not":[62],"explicitly":[63],"encourage":[64],"formation":[66],"globally":[68],"compact":[69,180],"representations,":[70,183],"limiting":[71],"their":[72,185],"effectiveness":[73],"backbones.":[77],"To":[78],"address":[79],"this,":[80],"we":[81,98],"propose":[82],"CoCoA,":[83],"a":[84],"Content":[85],"reconstruction":[86,107,162],"pre-training":[87],"based":[89],"Collaborative":[91],"Attention":[92],"optimization.":[96],"Specifically,":[97],"restructure":[99],"flow":[102],"introduce":[104],"an":[105,165],"EOS-based":[106],"task,":[108],"encouraging":[109],"model":[111,123],"to":[112,124,168],"reconstruct":[113],"input":[114,131],"from":[115],"corresponding":[117],"embeddings.":[118],"This":[119],"drives":[120],"compress":[125],"semantic":[127],"information":[128],"into":[132],"token,":[134],"laying":[135],"foundations":[137],"subsequent":[139],"learning.":[141],"Extensive":[142],"experiments":[143],"MMEB-V1":[145],"demonstrate":[146],"that":[147,160],"CoCoA":[148],"built":[149],"upon":[150],"Qwen2-VL":[151],"Qwen2.5-VL":[153],"significantly":[154],"improves":[155],"Results":[158],"validate":[159],"content":[161],"serves":[163],"strategy":[167],"maximize":[169],"value":[171],"data,":[174],"enabling":[175],"generate":[179],"informative":[182],"raising":[184],"ceiling.":[187]},"counts_by_year":[],"updated_date":"2026-03-04T07:09:34.246503","created_date":"2026-03-04T00:00:00"}
