{"id":"https://openalex.org/W7131374974","doi":"https://doi.org/10.48550/arxiv.2602.19605","title":"CLCR: Cross-Level Semantic Collaborative Representation for Multimodal Learning","display_name":"CLCR: Cross-Level Semantic Collaborative Representation for Multimodal Learning","publication_year":2026,"publication_date":"2026-02-23","ids":{"openalex":"https://openalex.org/W7131374974","doi":"https://doi.org/10.48550/arxiv.2602.19605"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2602.19605","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.19605","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2602.19605","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126832917","display_name":"Chunlei Meng","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Meng, Chunlei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126671982","display_name":"Guanhong Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Guanhong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126841733","display_name":"Rong Fu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fu, Rong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126671312","display_name":"Runmin Jian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jian, Runmin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126784930","display_name":"Zhongxue Gan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gan, Zhongxue","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5123476931","display_name":"Chun Ouyang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ouyang, Chun","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5126832917"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.8664000034332275,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.8664000034332275,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.05290000140666962,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.026100000366568565,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.48069998621940613},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.46320000290870667},{"id":"https://openalex.org/keywords/subspace-topology","display_name":"Subspace topology","score":0.4300000071525574},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4032999873161316},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature learning","score":0.3873000144958496},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.357699990272522},{"id":"https://openalex.org/keywords/hierarchy","display_name":"Hierarchy","score":0.34279999136924744},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.3411000072956085},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.3402000069618225}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8091999888420105},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.573199987411499},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.48069998621940613},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.46320000290870667},{"id":"https://openalex.org/C32834561","wikidata":"https://www.wikidata.org/wiki/Q660730","display_name":"Subspace topology","level":2,"score":0.4300000071525574},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4172999858856201},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4032999873161316},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.3873000144958496},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.357699990272522},{"id":"https://openalex.org/C31170391","wikidata":"https://www.wikidata.org/wiki/Q188619","display_name":"Hierarchy","level":2,"score":0.34279999136924744},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.3411000072956085},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.3402000069618225},{"id":"https://openalex.org/C511149849","wikidata":"https://www.wikidata.org/wiki/Q7449051","display_name":"Semantic computing","level":3,"score":0.32670000195503235},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.32510000467300415},{"id":"https://openalex.org/C2778493491","wikidata":"https://www.wikidata.org/wiki/Q7449072","display_name":"Semantic matching","level":3,"score":0.3174999952316284},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3125999867916107},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.30970001220703125},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.3086000084877014},{"id":"https://openalex.org/C110903229","wikidata":"https://www.wikidata.org/wiki/Q7449064","display_name":"Semantic integration","level":4,"score":0.2922999858856201},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.2782999873161316},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.27219998836517334},{"id":"https://openalex.org/C12362212","wikidata":"https://www.wikidata.org/wiki/Q728435","display_name":"Linear subspace","level":2,"score":0.2644999921321869},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.2637999951839447},{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.2606000006198883},{"id":"https://openalex.org/C130318100","wikidata":"https://www.wikidata.org/wiki/Q2268914","display_name":"Semantic similarity","level":2,"score":0.2605000138282776},{"id":"https://openalex.org/C99221444","wikidata":"https://www.wikidata.org/wiki/Q1532069","display_name":"Private information retrieval","level":2,"score":0.258899986743927},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.2531999945640564},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.25209999084472656},{"id":"https://openalex.org/C25810664","wikidata":"https://www.wikidata.org/wiki/Q44325","display_name":"Ontology","level":2,"score":0.25099998712539673}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2602.19605","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.19605","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2602.19605","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.19605","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.4660956561565399,"id":"https://metadata.un.org/sdg/17","display_name":"Partnerships for the goals"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Multimodal":[0],"learning":[1],"aims":[2],"to":[3,116,165,176,185],"capture":[4],"both":[5],"shared":[6,108,118,130,159,180],"and":[7,42,69,84,109,112,134,161,181,184,200,209],"private":[8,110,138,163,182],"information":[9,142],"from":[10,137],"multiple":[11],"modalities.":[12],"However,":[13],"existing":[14],"methods":[15],"that":[16,128,204],"project":[17],"all":[18],"modalities":[19],"into":[20,64,107],"a":[21,65,77,90,121,167],"single":[22],"latent":[23],"space":[24],"for":[25,73,93],"fusion":[26],"often":[27],"overlook":[28],"the":[29,117,145,158],"asynchronous,":[30],"multi-level":[31],"semantic":[32,40,67,78,151],"structure":[33],"of":[34,179],"multimodal":[35],"data.":[36],"This":[37,125],"oversight":[38],"induces":[39],"misalignment":[41],"error":[43],"propagation,":[44],"thereby":[45],"degrading":[46],"representation":[47],"quality.":[48],"To":[49,140],"address":[50],"this":[51],"issue,":[52],"we":[53],"propose":[54],"Cross-Level":[55],"Co-Representation":[56],"(CLCR),":[57],"which":[58],"explicitly":[59],"organizes":[60],"each":[61,98],"modality's":[62],"features":[63,86,106,183],"three-level":[66],"hierarchy":[68,79],"specifies":[70],"level-wise":[71],"constraints":[72],"cross-modal":[74,114],"interactions.":[75],"First,":[76],"encoder":[80],"aligns":[81],"shallow,":[82],"mid,":[83],"deep":[85],"across":[87,143,212],"modalities,":[88],"establishing":[89],"common":[91],"basis":[92],"interaction.":[94],"And":[95],"then,":[96],"at":[97],"level,":[99],"an":[100],"Intra-Level":[101],"Co-Exchange":[102],"Domain":[103,148],"(IntraCED)":[104],"factorizes":[105],"subspaces":[111],"restricts":[113],"attention":[115],"subspace":[119],"via":[120],"learnable":[122],"token":[123],"budget.":[124],"design":[126],"ensures":[127],"only":[129],"semantics":[131],"are":[132],"exchanged":[133],"prevents":[135],"leakage":[136],"channels.":[139],"integrate":[141],"levels,":[144],"Inter-Level":[146],"Co-Aggregation":[147],"(InterCAD)":[149],"synchronizes":[150],"scales":[152],"using":[153],"learned":[154],"anchors,":[155],"selectively":[156],"fuses":[157],"representations,":[160],"gates":[162],"cues":[164],"form":[166],"compact":[168],"task":[169],"representation.":[170],"We":[171],"further":[172],"introduce":[173],"regularization":[174],"terms":[175],"enforce":[177],"separation":[178],"minimize":[186],"cross-level":[187],"interference.":[188],"Experiments":[189],"on":[190],"six":[191],"benchmarks":[192],"spanning":[193],"emotion":[194],"recognition,":[195],"event":[196],"localization,":[197],"sentiment":[198],"analysis,":[199],"action":[201],"recognition":[202],"show":[203],"CLCR":[205],"achieves":[206],"strong":[207],"performance":[208],"generalizes":[210],"well":[211],"tasks.":[213]},"counts_by_year":[],"updated_date":"2026-02-26T06:34:08.959763","created_date":"2026-02-26T00:00:00"}
