{"id":"https://openalex.org/W7141127927","doi":"https://doi.org/10.48550/arxiv.2603.25460","title":"CLAR: CIF-Localized Alignment for Retrieval-Augmented Speech LLM-Based Contextual ASR","display_name":"CLAR: CIF-Localized Alignment for Retrieval-Augmented Speech LLM-Based Contextual ASR","publication_year":2026,"publication_date":"2026-03-26","ids":{"openalex":"https://openalex.org/W7141127927","doi":"https://doi.org/10.48550/arxiv.2603.25460"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.25460","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.25460","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.25460","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5113335993","display_name":"Shangkun Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Shangkun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130791579","display_name":"Huan Shen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Huan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130795331","display_name":"Wei Zou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zou, Wei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5051629064","display_name":"Yunzhang Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Yunzhang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9067999720573425,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9067999720573425,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.03359999880194664,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.014700000174343586,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.6599000096321106},{"id":"https://openalex.org/keywords/conjunction","display_name":"Conjunction (astronomy)","score":0.3815000057220459},{"id":"https://openalex.org/keywords/monotonic-function","display_name":"Monotonic function","score":0.32359999418258667},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.2937999963760376},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.27570000290870667}],"concepts":[{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6901000142097473},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6844000220298767},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.6599000096321106},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4855000078678131},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3901999890804291},{"id":"https://openalex.org/C59656382","wikidata":"https://www.wikidata.org/wiki/Q191536","display_name":"Conjunction (astronomy)","level":2,"score":0.3815000057220459},{"id":"https://openalex.org/C72169020","wikidata":"https://www.wikidata.org/wiki/Q194404","display_name":"Monotonic function","level":2,"score":0.32359999418258667},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.2937999963760376},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.27570000290870667},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.26759999990463257}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.25460","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.25460","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.25460","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.25460","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.5700271129608154}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Speech":[0,101],"LLM-based":[1],"ASR":[2,125],"often":[3],"struggles":[4],"with":[5,74],"named":[6],"entities":[7],"and":[8,63,67,80,85,116,120],"long-tail":[9],"words":[10],"due":[11],"to":[12,47],"strong":[13,123],"internal":[14],"language-model":[15],"priors.":[16],"Retrieval-augmented":[17],"biasing":[18],"can":[19],"help,":[20],"but":[21],"its":[22],"effectiveness":[23],"depends":[24],"on":[25],"accurate":[26],"hotword":[27,114],"localization":[28],"in":[29],"full-utterance":[30],"speech":[31],"under":[32],"weak":[33],"supervision.":[34],"We":[35],"propose":[36],"CLAR,":[37],"a":[38,75,86],"dual-encoder":[39],"speech-text":[40],"retriever":[41,71],"that":[42,110],"uses":[43],"Continuous":[44],"Integrate-and-Fire":[45],"(CIF)":[46],"learn":[48],"monotonic":[49],"token-level":[50],"alignments":[51],"without":[52,105],"timestamps.":[53],"With":[54],"length-aware":[55],"localized":[56],"matching,":[57],"CLAR":[58,111],"anchors":[59],"short-entity":[60],"acoustic":[61],"cues":[62],"reduces":[64,117],"representation":[65],"dilution":[66],"attention":[68],"drift.":[69],"The":[70],"is":[72],"trained":[73],"multi-granularity":[76],"objective":[77],"combining":[78],"global":[79],"local":[81],"segment-level":[82],"contrastive":[83],"losses":[84],"CIF":[87],"quantity":[88],"constraint.":[89],"At":[90],"inference,":[91],"top-ranked":[92],"hotwords":[93],"are":[94],"injected":[95],"as":[96],"contextual":[97,124],"prompts":[98],"for":[99],"the":[100],"LLM,":[102],"improving":[103],"recognition":[104],"shallow":[106],"fusion.":[107],"Experiments":[108],"show":[109],"significantly":[112],"improves":[113],"retrieval":[115],"both":[118],"CER":[119],"B-WER":[121],"against":[122],"baselines.":[126]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-28T00:00:00"}
