{"id":"https://openalex.org/W4415524378","doi":"https://doi.org/10.1109/mlsp62443.2025.11204242","title":"Closing the Gap in Multimodal Medical Representation Alignment","display_name":"Closing the Gap in Multimodal Medical Representation Alignment","publication_year":2025,"publication_date":"2025-08-31","ids":{"openalex":"https://openalex.org/W4415524378","doi":"https://doi.org/10.1109/mlsp62443.2025.11204242"},"language":null,"primary_location":{"id":"doi:10.1109/mlsp62443.2025.11204242","is_oa":false,"landing_page_url":"https://doi.org/10.1109/mlsp62443.2025.11204242","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE 35th International Workshop on Machine Learning for Signal Processing (MLSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5003606065","display_name":"Eleonora Grassucci","orcid":"https://orcid.org/0000-0003-4626-4506"},"institutions":[{"id":"https://openalex.org/I861853513","display_name":"Sapienza University of Rome","ror":"https://ror.org/02be6w209","country_code":"IT","type":"education","lineage":["https://openalex.org/I861853513"]}],"countries":["IT"],"is_corresponding":true,"raw_author_name":"Eleonora Grassucci","raw_affiliation_strings":["Sapienza University of Rome,Dept. of Information Engineering, Electronics and Telecommunications,Italy"],"affiliations":[{"raw_affiliation_string":"Sapienza University of Rome,Dept. of Information Engineering, Electronics and Telecommunications,Italy","institution_ids":["https://openalex.org/I861853513"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5095762359","display_name":"Giordano Cicchetti","orcid":null},"institutions":[{"id":"https://openalex.org/I861853513","display_name":"Sapienza University of Rome","ror":"https://ror.org/02be6w209","country_code":"IT","type":"education","lineage":["https://openalex.org/I861853513"]}],"countries":["IT"],"is_corresponding":false,"raw_author_name":"Giordano Cicchetti","raw_affiliation_strings":["Sapienza University of Rome,Dept. of Information Engineering, Electronics and Telecommunications,Italy"],"affiliations":[{"raw_affiliation_string":"Sapienza University of Rome,Dept. of Information Engineering, Electronics and Telecommunications,Italy","institution_ids":["https://openalex.org/I861853513"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5019647783","display_name":"Danilo Comminiello","orcid":"https://orcid.org/0000-0003-4067-4504"},"institutions":[{"id":"https://openalex.org/I861853513","display_name":"Sapienza University of Rome","ror":"https://ror.org/02be6w209","country_code":"IT","type":"education","lineage":["https://openalex.org/I861853513"]}],"countries":["IT"],"is_corresponding":false,"raw_author_name":"Danilo Comminiello","raw_affiliation_strings":["Sapienza University of Rome,Dept. of Information Engineering, Electronics and Telecommunications,Italy"],"affiliations":[{"raw_affiliation_string":"Sapienza University of Rome,Dept. of Information Engineering, Electronics and Telecommunications,Italy","institution_ids":["https://openalex.org/I861853513"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5003606065"],"corresponding_institution_ids":["https://openalex.org/I861853513"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.16050772,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.7836999893188477,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.7836999893188477,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.7267000079154968,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10759","display_name":"Translation Studies and Practices","score":0.7204999923706055,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closing","display_name":"Closing (real estate)","score":0.7534999847412109},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.6290000081062317},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5715000033378601},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.5432000160217285},{"id":"https://openalex.org/keywords/de-facto","display_name":"De facto","score":0.5171999931335449},{"id":"https://openalex.org/keywords/semantic-gap","display_name":"Semantic gap","score":0.4747999906539917},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.4690999984741211},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.4474000036716461}],"concepts":[{"id":"https://openalex.org/C2778775528","wikidata":"https://www.wikidata.org/wiki/Q5135432","display_name":"Closing (real estate)","level":2,"score":0.7534999847412109},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7027999758720398},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.6290000081062317},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6044999957084656},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5715000033378601},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.5432000160217285},{"id":"https://openalex.org/C2992317946","wikidata":"https://www.wikidata.org/wiki/Q712144","display_name":"De facto","level":2,"score":0.5171999931335449},{"id":"https://openalex.org/C86034646","wikidata":"https://www.wikidata.org/wiki/Q474311","display_name":"Semantic gap","level":4,"score":0.4747999906539917},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.4690999984741211},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4496999979019165},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.4474000036716461},{"id":"https://openalex.org/C31601959","wikidata":"https://www.wikidata.org/wiki/Q931309","display_name":"Medical imaging","level":2,"score":0.41499999165534973},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.3991999924182892},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.35190001130104065},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.29670000076293945},{"id":"https://openalex.org/C534262118","wikidata":"https://www.wikidata.org/wiki/Q177719","display_name":"Medical diagnosis","level":2,"score":0.2953000068664551},{"id":"https://openalex.org/C2779808786","wikidata":"https://www.wikidata.org/wiki/Q6664603","display_name":"Locality","level":2,"score":0.29190000891685486},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2897000014781952},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.28139999508857727},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.2712000012397766},{"id":"https://openalex.org/C33676613","wikidata":"https://www.wikidata.org/wiki/Q13415176","display_name":"Dimension (graph theory)","level":2,"score":0.2676999866962433},{"id":"https://openalex.org/C1667742","wikidata":"https://www.wikidata.org/wiki/Q10927554","display_name":"Image retrieval","level":3,"score":0.25780001282691956},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.25519999861717224}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/mlsp62443.2025.11204242","is_oa":false,"landing_page_url":"https://doi.org/10.1109/mlsp62443.2025.11204242","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE 35th International Workshop on Machine Learning for Signal Processing (MLSP)","raw_type":"proceedings-article"},{"id":"pmh:doi:10.48550/arxiv.2602.20046","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.20046","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":6,"referenced_works":["https://openalex.org/W2897980926","https://openalex.org/W4285606530","https://openalex.org/W4372266552","https://openalex.org/W4385573131","https://openalex.org/W4386071707","https://openalex.org/W4404725396"],"related_works":[],"abstract_inverted_index":{"In":[0,83],"multimodal":[1,76],"learning,":[2],"CLIP":[3],"has":[4,58],"emerged":[5],"as":[6,54,79],"the":[7,55,80,91,96],"de":[8],"facto":[9],"approach":[10],"for":[11,62],"mapping":[12],"different":[13],"modalities":[14],"into":[15],"a":[16,108],"shared":[17],"latent":[18,49],"space":[19],"by":[20],"bringing":[21],"semantically":[22,117],"similar":[23],"representations":[24,119],"closer":[25],"while":[26],"pushing":[27],"apart":[28],"dissimilar":[29],"ones.":[30],"However,":[31],"CLIPbased":[32],"contrastive":[33],"losses":[34],"exhibit":[35],"unintended":[36],"behaviors":[37],"that":[38,95,111,116],"negatively":[39],"impact":[40],"true":[41],"semantic":[42],"alignment,":[43,104],"leading":[44],"to":[45],"sparse":[46],"and":[47,65,71,105,135,141],"fragmented":[48],"spaces.":[50],"This":[51],"phenomenon,":[52],"known":[53],"modality":[56,97],"gap,":[57,114],"been":[59],"partially":[60],"mitigated":[61],"standard":[63],"text":[64],"image":[66,142],"pairs":[67],"but":[68],"remains":[69],"unknown":[70],"unresolved":[72],"in":[73,90,102],"more":[74,121],"complex":[75],"settings,":[77],"such":[78],"medical":[81,103],"domain.":[82],"this":[84,88,113],"work,":[85],"we":[86,106],"study":[87],"phenomenon":[89],"latter":[92],"case,":[93],"revealing":[94],"gap":[98],"is":[99],"present":[100],"also":[101],"propose":[107],"modality-agnostic":[109],"framework":[110],"closes":[112],"ensuring":[115],"related":[118],"are":[120],"aligned,":[122],"regardless":[123],"of":[124],"their":[125],"source":[126],"modality.":[127],"Our":[128],"method":[129],"enhances":[130],"alignment":[131],"between":[132],"radiology":[133],"images":[134],"clinical":[136],"text,":[137],"improving":[138],"cross-modal":[139],"retrieval":[140],"captioning.":[143]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2025-10-24T00:00:00"}
