{"id":"https://openalex.org/W4417087011","doi":"https://doi.org/10.48550/arxiv.2512.04847","title":"Language Models as Semantic Teachers: Post-Training Alignment for Medical Audio Understanding","display_name":"Language Models as Semantic Teachers: Post-Training Alignment for Medical Audio Understanding","publication_year":2025,"publication_date":"2025-12-04","ids":{"openalex":"https://openalex.org/W4417087011","doi":"https://doi.org/10.48550/arxiv.2512.04847"},"language":null,"primary_location":{"id":"pmh:oai:arXiv.org:2512.04847","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2512.04847","pdf_url":"https://arxiv.org/pdf/2512.04847","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2512.04847","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100427963","display_name":"Ting Wang","orcid":"https://orcid.org/0000-0002-6800-242X"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wang, Tsai-Ning","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072659107","display_name":"Lin-Lin Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Lin-Lin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047639590","display_name":"Neil Zeghidour","orcid":"https://orcid.org/0000-0001-6896-3987"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zeghidour, Neil","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5011960578","display_name":"Aaqib Saeed","orcid":"https://orcid.org/0000-0003-1473-0322"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Saeed, Aaqib","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5100427963"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12419","display_name":"Phonocardiography and Auscultation Techniques","score":0.7364000082015991,"subfield":{"id":"https://openalex.org/subfields/2740","display_name":"Pulmonary and Respiratory Medicine"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},"topics":[{"id":"https://openalex.org/T12419","display_name":"Phonocardiography and Auscultation Techniques","score":0.7364000082015991,"subfield":{"id":"https://openalex.org/subfields/2740","display_name":"Pulmonary and Respiratory Medicine"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T12418","display_name":"Respiratory and Cough-Related Research","score":0.19920000433921814,"subfield":{"id":"https://openalex.org/subfields/2740","display_name":"Pulmonary and Respiratory Medicine"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T11775","display_name":"COVID-19 diagnosis using AI","score":0.018400000408291817,"subfield":{"id":"https://openalex.org/subfields/2741","display_name":"Radiology, Nuclear Medicine and Imaging"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.5099999904632568},{"id":"https://openalex.org/keywords/grasp","display_name":"GRASP","score":0.47099998593330383},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.46399998664855957},{"id":"https://openalex.org/keywords/metadata","display_name":"Metadata","score":0.4546999931335449},{"id":"https://openalex.org/keywords/boosting","display_name":"Boosting (machine learning)","score":0.4140999913215637},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.39160001277923584},{"id":"https://openalex.org/keywords/auscultation","display_name":"Auscultation","score":0.37869998812675476}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8335000276565552},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5564000010490417},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5224000215530396},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.5099999904632568},{"id":"https://openalex.org/C171268870","wikidata":"https://www.wikidata.org/wiki/Q1486676","display_name":"GRASP","level":2,"score":0.47099998593330383},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.46399998664855957},{"id":"https://openalex.org/C93518851","wikidata":"https://www.wikidata.org/wiki/Q180160","display_name":"Metadata","level":2,"score":0.4546999931335449},{"id":"https://openalex.org/C46686674","wikidata":"https://www.wikidata.org/wiki/Q466303","display_name":"Boosting (machine learning)","level":2,"score":0.4140999913215637},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.39239999651908875},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.39160001277923584},{"id":"https://openalex.org/C2777324038","wikidata":"https://www.wikidata.org/wiki/Q779054","display_name":"Auscultation","level":2,"score":0.37869998812675476},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.3084000051021576},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.29670000076293945},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2865000069141388},{"id":"https://openalex.org/C100776233","wikidata":"https://www.wikidata.org/wiki/Q2532492","display_name":"Bridge (graph theory)","level":2,"score":0.28220000863075256},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.27239999175071716},{"id":"https://openalex.org/C2776230583","wikidata":"https://www.wikidata.org/wiki/Q1322198","display_name":"Spoken language","level":2,"score":0.25940001010894775},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.2563999891281128}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2512.04847","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2512.04847","pdf_url":"https://arxiv.org/pdf/2512.04847","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2512.04847","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2512.04847","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2512.04847","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2512.04847","pdf_url":"https://arxiv.org/pdf/2512.04847","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G5365867299","display_name":null,"funder_award_id":"Grant","funder_id":"https://openalex.org/F4320321800","funder_display_name":"Nederlandse Organisatie voor Wetenschappelijk Onderzoek"},{"id":"https://openalex.org/G5455436254","display_name":null,"funder_award_id":"Fellowship","funder_id":"https://openalex.org/F4320321800","funder_display_name":"Nederlandse Organisatie voor Wetenschappelijk Onderzoek"},{"id":"https://openalex.org/G6009445997","display_name":null,"funder_award_id":"unknown","funder_id":"https://openalex.org/F4320321800","funder_display_name":"Nederlandse Organisatie voor Wetenschappelijk Onderzoek"}],"funders":[{"id":"https://openalex.org/F4320321800","display_name":"Nederlandse Organisatie voor Wetenschappelijk Onderzoek","ror":"https://ror.org/04jsz6e67"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4417087011.pdf","grobid_xml":"https://content.openalex.org/works/W4417087011.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Pre-trained":[0],"audio":[1,49,89],"models":[2,80,169],"excel":[3],"at":[4,68],"detecting":[5],"acoustic":[6,168],"patterns":[7],"in":[8,24,182],"auscultation":[9],"sounds":[10],"but":[11],"often":[12],"fail":[13],"to":[14,81,141,157],"grasp":[15],"their":[16,20],"clinical":[17,93,112],"significance,":[18],"limiting":[19],"use":[21],"and":[22],"performance":[23],"diagnostic":[25,172],"tasks.":[26],"To":[27,65],"bridge":[28],"this":[29,163],"gap,":[30],"we":[31,70],"introduce":[32],"AcuLa":[33,119],"(Audio-Clinical":[34],"Understanding":[35],"via":[36],"Language":[37],"Alignment),":[38],"a":[39,55,62,72,99,104,175],"lightweight":[40],"post-training":[41],"framework":[42],"that":[43,108,162],"instills":[44],"semantic":[45],"understanding":[46,181],"into":[47,91,170],"any":[48],"encoder":[50],"by":[51,75],"aligning":[52],"it":[53],"with":[54,103],"medical":[56],"language":[57,79],"model,":[58],"which":[59],"acts":[60],"as":[61],"\"semantic":[63],"teacher.\"":[64],"enable":[66],"alignment":[67,96,165],"scale,":[69],"construct":[71],"large-scale":[73],"dataset":[74],"leveraging":[76],"off-the-shelf":[77],"large":[78],"translate":[82],"the":[83,109,133,145,153],"rich,":[84],"structured":[85],"metadata":[86],"accompanying":[87],"existing":[88],"recordings":[90],"coherent":[92],"reports.":[94],"Our":[95,159],"strategy":[97],"combines":[98],"representation-level":[100],"contrastive":[101],"objective":[102],"self-supervised":[105],"modeling,":[106],"ensuring":[107],"model":[110],"learns":[111],"semantics":[113],"while":[114],"preserving":[115],"fine-grained":[116],"temporal":[117],"cues.":[118],"achieves":[120],"state-of-the-art":[121],"results":[122],"across":[123],"18":[124],"diverse":[125],"cardio-respiratory":[126],"tasks":[127],"from":[128,139,155],"10":[129],"different":[130],"datasets,":[131],"improving":[132],"mean":[134],"AUROC":[135,154],"on":[136,144],"classification":[137],"benchmarks":[138],"0.68":[140],"0.79":[142],"and,":[143],"most":[146],"challenging":[147],"COVID-19":[148],"cough":[149],"detection":[150],"task,":[151],"boosting":[152],"0.55":[156],"0.89.":[158],"work":[160],"demonstrates":[161],"audio-language":[164],"transforms":[166],"purely":[167],"clinically-aware":[171],"tools,":[173],"establishing":[174],"novel":[176],"paradigm":[177],"for":[178],"enhancing":[179],"physiological":[180],"audio-based":[183],"health":[184],"monitoring.":[185]},"counts_by_year":[],"updated_date":"2026-04-14T08:04:32.555800","created_date":"2025-12-06T00:00:00"}
