{"id":"https://openalex.org/W7127374880","doi":"https://doi.org/10.48550/arxiv.2602.00701","title":"Cross-Modal Binary Attention: An Energy-Efficient Fusion Framework for Audio-Visual Learning","display_name":"Cross-Modal Binary Attention: An Energy-Efficient Fusion Framework for Audio-Visual Learning","publication_year":2026,"publication_date":"2026-01-31","ids":{"openalex":"https://openalex.org/W7127374880","doi":"https://doi.org/10.48550/arxiv.2602.00701"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.00701","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124879384","display_name":"Mohamed Saleh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Saleh, Mohamed","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5124900405","display_name":"Zahra Ahmadi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ahmadi, Zahra","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.5026000142097473,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.5026000142097473,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.1216999962925911,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12032","display_name":"Multisensory perception and integration","score":0.07859999686479568,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/fusion-mechanism","display_name":"Fusion mechanism","score":0.6955000162124634},{"id":"https://openalex.org/keywords/fusion","display_name":"Fusion","score":0.6808000206947327},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.6470999717712402},{"id":"https://openalex.org/keywords/concatenation","display_name":"Concatenation (mathematics)","score":0.5475999712944031},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5289999842643738},{"id":"https://openalex.org/keywords/binary-number","display_name":"Binary number","score":0.4293999969959259},{"id":"https://openalex.org/keywords/sensor-fusion","display_name":"Sensor fusion","score":0.39010000228881836},{"id":"https://openalex.org/keywords/computational-complexity-theory","display_name":"Computational complexity theory","score":0.3529999852180481}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.722000002861023},{"id":"https://openalex.org/C173414695","wikidata":"https://www.wikidata.org/wiki/Q5510276","display_name":"Fusion mechanism","level":4,"score":0.6955000162124634},{"id":"https://openalex.org/C158525013","wikidata":"https://www.wikidata.org/wiki/Q2593739","display_name":"Fusion","level":2,"score":0.6808000206947327},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.6470999717712402},{"id":"https://openalex.org/C87619178","wikidata":"https://www.wikidata.org/wiki/Q126002","display_name":"Concatenation (mathematics)","level":2,"score":0.5475999712944031},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5353999733924866},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5289999842643738},{"id":"https://openalex.org/C48372109","wikidata":"https://www.wikidata.org/wiki/Q3913","display_name":"Binary number","level":2,"score":0.4293999969959259},{"id":"https://openalex.org/C33954974","wikidata":"https://www.wikidata.org/wiki/Q486494","display_name":"Sensor fusion","level":2,"score":0.39010000228881836},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.3529999852180481},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.3472000062465668},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.34689998626708984},{"id":"https://openalex.org/C2742236","wikidata":"https://www.wikidata.org/wiki/Q924713","display_name":"Efficient energy use","level":2,"score":0.34279999136924744},{"id":"https://openalex.org/C155512373","wikidata":"https://www.wikidata.org/wiki/Q287450","display_name":"Residual","level":2,"score":0.31130000948905945},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.30410000681877136},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.30160000920295715},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.27559998631477356},{"id":"https://openalex.org/C2778971668","wikidata":"https://www.wikidata.org/wiki/Q5510284","display_name":"Fusion rules","level":4,"score":0.2743000090122223},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.27410000562667847},{"id":"https://openalex.org/C186370098","wikidata":"https://www.wikidata.org/wiki/Q442787","display_name":"Energy (signal processing)","level":2,"score":0.2687000036239624},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.251800000667572}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.00701","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.00701","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.00701","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.00701","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"score":0.8526632189750671,"display_name":"Affordable and clean energy","id":"https://metadata.un.org/sdg/7"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Effective":[0],"multimodal":[1,117,185,191],"fusion":[2,20,44,63,77,98,118,138,166,186,192,197],"requires":[3],"mechanisms":[4],"that":[5,37,50,65,124,199],"can":[6],"capture":[7,144],"complex":[8],"cross-modal":[9,30,55,62,86,107,202],"dependencies":[10],"while":[11,42,103,164],"remaining":[12],"computationally":[13],"scalable":[14,75,196],"for":[15,208],"real-world":[16,209],"deployment.":[17],"Existing":[18],"audio-visual":[19,175,210],"approaches":[21],"face":[22],"a":[23,60,121,195],"fundamental":[24],"trade-off:":[25],"attention-based":[26],"methods":[27],"effectively":[28],"model":[29],"relationships":[31],"but":[32],"incur":[33],"quadratic":[34],"computational":[35],"complexity":[36,69],"prevents":[38],"hierarchical,":[39],"multi-scale":[40,137],"architectures,":[41],"efficient":[43,71],"strategies":[45],"rely":[46],"on":[47,173],"simplistic":[48],"concatenation":[49],"fails":[51],"to":[52,89,99,143],"extract":[53,90],"complementary":[54,91],"information.":[56,108],"We":[57],"introduce":[58],"CMQKA,":[59,111],"novel":[61],"mechanism":[64,198],"achieves":[66,160],"linear":[67],"O(N)":[68],"through":[70,127],"binary":[72,156],"operations,":[73,158],"enabling":[74],"hierarchical":[76,122,201],"previously":[78],"infeasible":[79],"with":[80,106,120,154,204],"conventional":[81],"attention.":[82],"CMQKA":[83],"employs":[84],"bidirectional":[85],"Query-Key":[87],"attention":[88],"spatiotemporal":[92],"features":[93],"and":[94,132,148,168,180],"uses":[95],"learnable":[96],"residual":[97],"preserve":[100],"modality-specific":[101],"characteristics":[102],"enriching":[104],"representations":[105],"Building":[109],"upon":[110],"we":[112],"present":[113],"SNNergy,":[114],"an":[115],"energy-efficient":[116],"framework":[119,142,189],"architecture":[123],"processes":[125],"inputs":[126],"progressively":[128],"decreasing":[129],"spatial":[130],"resolutions":[131],"increasing":[133],"semantic":[134],"abstraction.":[135],"This":[136],"capability":[139],"allows":[140],"the":[141],"both":[145],"local":[146],"patterns":[147],"global":[149],"context":[150],"across":[151],"modalities.":[152],"Implemented":[153],"event-driven":[155],"spike":[157],"SNNergy":[159],"remarkable":[161],"energy":[162,206],"efficiency":[163,207],"maintaining":[165],"effectiveness":[167],"establishing":[169],"new":[170],"state-of-the-art":[171],"results":[172],"challenging":[174],"benchmarks,":[176],"including":[177],"CREMA-D,":[178],"AVE,":[179],"UrbanSound8K-AV,":[181],"significantly":[182],"outperforming":[183],"existing":[184],"baselines.":[187],"Our":[188],"advances":[190],"by":[193],"introducing":[194],"enables":[200],"integration":[203],"practical":[205],"intelligence":[211],"systems.":[212]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-02-04T00:00:00"}
