{"id":"https://openalex.org/W4412944625","doi":"https://doi.org/10.18653/v1/2025.findings-acl.1281","title":"Let\u2019s Fuse Step by Step: A Generative Fusion Decoding Algorithm with LLMs for Robust and Instruction-Aware ASR and OCR","display_name":"Let\u2019s Fuse Step by Step: A Generative Fusion Decoding Algorithm with LLMs for Robust and Instruction-Aware ASR and OCR","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4412944625","doi":"https://doi.org/10.18653/v1/2025.findings-acl.1281"},"language":"en","primary_location":{"id":"doi:10.18653/v1/2025.findings-acl.1281","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-acl.1281","pdf_url":"https://aclanthology.org/2025.findings-acl.1281.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: ACL 2025","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.findings-acl.1281.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5059902893","display_name":"Chan-Jan Hsu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chan-Jan Hsu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101438450","display_name":"Yi\u2010Chang Chen","orcid":"https://orcid.org/0000-0002-1841-598X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yi-Chang Chen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067740015","display_name":"Feng-Ting Liao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feng-Ting Liao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056605823","display_name":"Pei-Chen Ho","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pei-Chen Ho","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101619495","display_name":"Yu-Hsiang Wang","orcid":"https://orcid.org/0000-0001-7747-4456"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu-Hsiang Wang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034235870","display_name":"Po\u2010Chun Hsu","orcid":"https://orcid.org/0000-0002-6509-9377"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Po-Chun Hsu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5035060493","display_name":"Da-shan Shiu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Da-shan Shiu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.08841989,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"24959","last_page":"24973"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12072","display_name":"Machine Learning and Algorithms","score":0.9639000296592712,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12072","display_name":"Machine Learning and Algorithms","score":0.9639000296592712,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10320","display_name":"Neural Networks and Applications","score":0.940500020980835,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.9132999777793884,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/fuse","display_name":"Fuse (electrical)","score":0.8428237438201904},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.7452558875083923},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6354237198829651},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.5853805541992188},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.5227111577987671},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.46344757080078125},{"id":"https://openalex.org/keywords/fusion","display_name":"Fusion","score":0.4375791549682617},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.3371681571006775},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.11049196124076843},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.07234320044517517}],"concepts":[{"id":"https://openalex.org/C141353440","wikidata":"https://www.wikidata.org/wiki/Q182221","display_name":"Fuse (electrical)","level":2,"score":0.8428237438201904},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.7452558875083923},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6354237198829651},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.5853805541992188},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.5227111577987671},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.46344757080078125},{"id":"https://openalex.org/C158525013","wikidata":"https://www.wikidata.org/wiki/Q2593739","display_name":"Fusion","level":2,"score":0.4375791549682617},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3371681571006775},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.11049196124076843},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.07234320044517517},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2025.findings-acl.1281","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-acl.1281","pdf_url":"https://aclanthology.org/2025.findings-acl.1281.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: ACL 2025","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.findings-acl.1281","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-acl.1281","pdf_url":"https://aclanthology.org/2025.findings-acl.1281.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: ACL 2025","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4412944625.pdf","grobid_xml":"https://content.openalex.org/works/W4412944625.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W3000097931","https://openalex.org/W2354322770","https://openalex.org/W4237547500","https://openalex.org/W1570848052","https://openalex.org/W2373192430","https://openalex.org/W4239268388","https://openalex.org/W1537496349","https://openalex.org/W4243305035","https://openalex.org/W4390606538","https://openalex.org/W2095903272"],"abstract_inverted_index":{"We":[0],"propose":[1],"\"Generative":[2],"Fusion":[3],"Decoding\"":[4],"(GFD),":[5],"a":[6],"novel":[7],"shallow":[8],"fusion":[9,59],"framework":[10],"designed":[11],"to":[12,37,40,132],"integrate":[13],"large":[14],"language":[15],"models":[16,48,78],"(LLMs)":[17],"into":[18],"cross-modal":[19],"text":[20],"recognition":[21,26,31],"systems":[22],"for":[23,82,87,118],"automatic":[24],"speech":[25],"(ASR)":[27],"and":[28,60,90,95,105,116,123],"optical":[29],"character":[30],"(OCR).We":[32],"derive":[33],"the":[34,53,64,80],"necessary":[35],"formulations":[36],"enable":[38],"GFD":[39,109],"operate":[41],"across":[42],"mismatched":[43],"token":[44],"spaces":[45],"of":[46,114,130],"different":[47],"by":[49,69],"calculating":[50],"likelihood":[51],"at":[52],"byte":[54],"level,":[55],"thereby":[56],"enabling":[57],"seamless":[58],"synchronous":[61],"progression":[62],"during":[63],"decoding":[65],"process.GFD":[66],"is":[67],"plug-and-play":[68],"design,":[70],"making":[71],"it":[72],"readily":[73],"compatible":[74],"with":[75,98],"various":[76],"auto-regressive":[77],"without":[79],"need":[81],"any":[83],"re-training.GFD":[84],"proves":[85],"effective":[86],"general":[88],"ASR":[89,120],"OCR":[91],"tasks":[92],"through":[93],"intermediate":[94],"frequent":[96],"interactions":[97],"LLMs,":[99],"surpassing":[100],"cascaded":[101],"methods":[102],"in":[103,121],"English":[104],"Mandarin":[106],"benchmarks.In":[107],"addition,":[108],"transfers":[110],"in-context":[111],"learning":[112],"abilities":[113],"LLMs":[115],"allows":[117],"adaptive":[119],"instruction-aware":[122],"long-context":[124],"settings,":[125],"yielding":[126],"significant":[127],"WER":[128],"reductions":[129],"up":[131],"17.7%.":[133]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
