{"id":"https://openalex.org/W7146967167","doi":"https://doi.org/10.1109/icvisp68610.2025.11451708","title":"Vision-driven Adaptive Decoding for Document Understanding: A Dual-Path Visual-Language and OCR Framework with Prompt-aware Task Routing","display_name":"Vision-driven Adaptive Decoding for Document Understanding: A Dual-Path Visual-Language and OCR Framework with Prompt-aware Task Routing","publication_year":2025,"publication_date":"2025-11-28","ids":{"openalex":"https://openalex.org/W7146967167","doi":"https://doi.org/10.1109/icvisp68610.2025.11451708"},"language":null,"primary_location":{"id":"doi:10.1109/icvisp68610.2025.11451708","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icvisp68610.2025.11451708","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 9th International Conference on Vision, Image and Signal Processing (ICVISP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100721606","display_name":"Ao Shen","orcid":"https://orcid.org/0000-0001-7196-2168"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Ao Shen","raw_affiliation_strings":["Beihang University,School of Artificial Intelligence,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Beihang University,School of Artificial Intelligence,Beijing,China","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132675611","display_name":"Tongfei Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tongfei Chen","raw_affiliation_strings":["Beihang University,School of Artificial Intelligence,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Beihang University,School of Artificial Intelligence,Beijing,China","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049645701","display_name":"Yilin Sun","orcid":"https://orcid.org/0000-0001-9390-4030"},"institutions":[{"id":"https://openalex.org/I202334528","display_name":"Beijing Electronic Science and Technology Institute","ror":"https://ror.org/01xdzh226","country_code":"CN","type":"education","lineage":["https://openalex.org/I202334528"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yangyang Sun","raw_affiliation_strings":["Beijing Institute of Control and Electronics Technology,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Beijing Institute of Control and Electronics Technology,Beijing,China","institution_ids":["https://openalex.org/I202334528"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101773676","display_name":"Feng Jiang","orcid":"https://orcid.org/0000-0001-8817-6657"},"institutions":[{"id":"https://openalex.org/I202334528","display_name":"Beijing Electronic Science and Technology Institute","ror":"https://ror.org/01xdzh226","country_code":"CN","type":"education","lineage":["https://openalex.org/I202334528"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Feng Jiang","raw_affiliation_strings":["Beijing Institute of Control and Electronics Technology,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Beijing Institute of Control and Electronics Technology,Beijing,China","institution_ids":["https://openalex.org/I202334528"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066430668","display_name":"Yaogong Feng","orcid":null},"institutions":[{"id":"https://openalex.org/I202334528","display_name":"Beijing Electronic Science and Technology Institute","ror":"https://ror.org/01xdzh226","country_code":"CN","type":"education","lineage":["https://openalex.org/I202334528"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yaogong Feng","raw_affiliation_strings":["Beijing Institute of Control and Electronics Technology,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Beijing Institute of Control and Electronics Technology,Beijing,China","institution_ids":["https://openalex.org/I202334528"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037465848","display_name":"Kun Hu","orcid":"https://orcid.org/0000-0001-5044-7327"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kun Hu","raw_affiliation_strings":["Beihang University,School of Artificial Intelligence,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Beihang University,School of Artificial Intelligence,Beijing,China","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5124193135","display_name":"Mengqi Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I6507939","display_name":"China United Network Communications Group (China)","ror":"https://ror.org/028w99c90","country_code":"CN","type":"company","lineage":["https://openalex.org/I6507939"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mengqi Liu","raw_affiliation_strings":["China Information Communication Technologies Group Design Institute Co., Ltd., China Unicom,Beijing,China"],"affiliations":[{"raw_affiliation_string":"China Information Communication Technologies Group Design Institute Co., Ltd., China Unicom,Beijing,China","institution_ids":["https://openalex.org/I6507939"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5100721606"],"corresponding_institution_ids":["https://openalex.org/I82880672"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.75243992,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9369999766349792,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9369999766349792,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.016100000590085983,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.004900000058114529,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/digitization","display_name":"Digitization","score":0.5231000185012817},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.4690000116825104},{"id":"https://openalex.org/keywords/table","display_name":"Table (database)","score":0.462799996137619},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4494999945163727},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.43849998712539673},{"id":"https://openalex.org/keywords/optical-character-recognition","display_name":"Optical character recognition","score":0.4309999942779541},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.39820000529289246},{"id":"https://openalex.org/keywords/document-layout-analysis","display_name":"Document layout analysis","score":0.3896999955177307},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.352400004863739}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8360000252723694},{"id":"https://openalex.org/C2779308522","wikidata":"https://www.wikidata.org/wiki/Q843958","display_name":"Digitization","level":2,"score":0.5231000185012817},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.512499988079071},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5049999952316284},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4699000120162964},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.4690000116825104},{"id":"https://openalex.org/C45235069","wikidata":"https://www.wikidata.org/wiki/Q278425","display_name":"Table (database)","level":2,"score":0.462799996137619},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4494999945163727},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.43849998712539673},{"id":"https://openalex.org/C546480517","wikidata":"https://www.wikidata.org/wiki/Q167555","display_name":"Optical character recognition","level":3,"score":0.4309999942779541},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.39820000529289246},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.39739999175071716},{"id":"https://openalex.org/C72773152","wikidata":"https://www.wikidata.org/wiki/Q5287629","display_name":"Document layout analysis","level":3,"score":0.3896999955177307},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.352400004863739},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.35199999809265137},{"id":"https://openalex.org/C74172769","wikidata":"https://www.wikidata.org/wiki/Q1446839","display_name":"Routing (electronic design automation)","level":2,"score":0.34929999709129333},{"id":"https://openalex.org/C33954974","wikidata":"https://www.wikidata.org/wiki/Q486494","display_name":"Sensor fusion","level":2,"score":0.3361999988555908},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.32690000534057617},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3010999858379364},{"id":"https://openalex.org/C67905146","wikidata":"https://www.wikidata.org/wiki/Q5287646","display_name":"Document processing","level":2,"score":0.29600000381469727},{"id":"https://openalex.org/C79158427","wikidata":"https://www.wikidata.org/wiki/Q485396","display_name":"Analytics","level":2,"score":0.2948000133037567},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.2897000014781952},{"id":"https://openalex.org/C2779135771","wikidata":"https://www.wikidata.org/wiki/Q403574","display_name":"Named-entity recognition","level":3,"score":0.28600001335144043},{"id":"https://openalex.org/C2777737414","wikidata":"https://www.wikidata.org/wiki/Q4868296","display_name":"Font","level":2,"score":0.28290000557899475},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2768999934196472},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.2676999866962433},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.26739999651908875},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2653000056743622},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.2637999951839447},{"id":"https://openalex.org/C177937566","wikidata":"https://www.wikidata.org/wiki/Q4223102","display_name":"Document clustering","level":3,"score":0.26030001044273376},{"id":"https://openalex.org/C2983812711","wikidata":"https://www.wikidata.org/wiki/Q167555","display_name":"Text recognition","level":3,"score":0.25839999318122864},{"id":"https://openalex.org/C115901376","wikidata":"https://www.wikidata.org/wiki/Q184199","display_name":"Automation","level":2,"score":0.2531999945640564}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icvisp68610.2025.11451708","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icvisp68610.2025.11451708","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 9th International Conference on Vision, Image and Signal Processing (ICVISP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.4736499786376953,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":2,"referenced_works":["https://openalex.org/W2997154779","https://openalex.org/W4413145313"],"related_works":[],"abstract_inverted_index":{"Real-world":[0],"document":[1,131],"understanding":[2],"faces":[3],"challenges":[4],"such":[5],"as":[6],"layout":[7,71],"diversity,":[8],"low-quality":[9],"imaging,":[10],"and":[11,18,72,85,100,107,124,126,133,140],"multilingual":[12],"text,":[13],"which":[14],"hinder":[15],"text":[16,84,98],"recognition":[17],"table":[19,101],"structure":[20],"reconstruction.":[21],"Traditional":[22],"OCR-based":[23,63],"pipelines":[24],"achieve":[25],"high":[26],"character-level":[27],"precision":[28,116],"but":[29],"lack":[30],"global":[31],"semantic":[32,73],"understanding,":[33,125],"while":[34],"general-purpose":[35],"vision-language":[36],"models":[37],"(VLMs)":[38],"often":[39],"struggle":[40],"on":[41,90],"text-heavy":[42],"documents.":[43],"To":[44],"address":[45],"these":[46],"limitations,":[47],"we":[48],"propose":[49],"VADoc":[50,94,128],"(Vision-driven":[51],"Adaptive":[52],"Document":[53],"Understanding),":[54],"a":[55],"dual-branch":[56],"multimodal":[57],"fusion":[58,77],"framework":[59],"that":[60,93,113,127],"integrates":[61],"an":[62],"branch":[64],"for":[65,70,137],"token-level":[66],"grounding":[67],"with":[68,117],"Qwen2.5-VL":[69],"reasoning.":[74],"A":[75],"lightweight":[76],"module":[78],"aligns":[79],"both":[80,97],"outputs":[81],"into":[82],"coherent":[83],"structured":[86,134],"HTML":[87],"representations.":[88],"Experiments":[89],"OmniDocBench":[91],"show":[92],"substantially":[95],"improves":[96],"accuracy":[99],"reconstruction":[102],"quality":[103],"over":[104],"standalone":[105],"VLMs":[106],"OCR":[108,115],"systems.":[109],"These":[110],"results":[111],"indicate":[112],"combining":[114],"VLM":[118],"contextual":[119],"reasoning":[120],"effectively":[121],"bridges":[122],"perception":[123],"supports":[129],"robust":[130],"digitization":[132],"data":[135],"extraction":[136],"downstream":[138],"NLP":[139],"analytics":[141],"tasks.":[142]},"counts_by_year":[],"updated_date":"2026-04-02T13:53:19.096889","created_date":"2026-04-02T00:00:00"}
