{"id":"https://openalex.org/W4415539037","doi":"https://doi.org/10.1145/3746027.3755393","title":"DR-VQA: Decompose-then-Reconstruct for Visual Question Answering in BLV Assistance","display_name":"DR-VQA: Decompose-then-Reconstruct for Visual Question Answering in BLV Assistance","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415539037","doi":"https://doi.org/10.1145/3746027.3755393"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3755393","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746027.3755393","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3746027.3755393","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101315769","display_name":"Bocheng Pan","orcid":null},"institutions":[{"id":"https://openalex.org/I4210090209","display_name":"Institute of Microelectronics","ror":"https://ror.org/009rw8n36","country_code":"SG","type":"facility","lineage":["https://openalex.org/I115228651","https://openalex.org/I4210090209","https://openalex.org/I91275662"]}],"countries":["SG"],"is_corresponding":true,"raw_author_name":"Bocheng Pan","raw_affiliation_strings":["Institute of Microelectronics, Chinese Academy of Sciences, Beijing, China and University of Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Microelectronics, Chinese Academy of Sciences, Beijing, China and University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210090209"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090983492","display_name":"Hailong Shi","orcid":"https://orcid.org/0009-0002-6545-1999"},"institutions":[{"id":"https://openalex.org/I4210090209","display_name":"Institute of Microelectronics","ror":"https://ror.org/009rw8n36","country_code":"SG","type":"facility","lineage":["https://openalex.org/I115228651","https://openalex.org/I4210090209","https://openalex.org/I91275662"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Hailong Shi","raw_affiliation_strings":["Institute of Microelectronics, Chinese Academy of Sciences, Beijing, China and University of Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Microelectronics, Chinese Academy of Sciences, Beijing, China and University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210090209"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5004694827","display_name":"Xingyu Gao","orcid":"https://orcid.org/0000-0002-4660-8092"},"institutions":[{"id":"https://openalex.org/I4210090209","display_name":"Institute of Microelectronics","ror":"https://ror.org/009rw8n36","country_code":"SG","type":"facility","lineage":["https://openalex.org/I115228651","https://openalex.org/I4210090209","https://openalex.org/I91275662"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Xingyu Gao","raw_affiliation_strings":["Institute of Microelectronics, Chinese Academy of Sciences, Beijing, China and University of Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Microelectronics, Chinese Academy of Sciences, Beijing, China and University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210090209"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5101315769"],"corresponding_institution_ids":["https://openalex.org/I4210090209"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.32455623,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"4339","last_page":"4348"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9972000122070312,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9556999802589417,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.6434000134468079},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.4268999993801117},{"id":"https://openalex.org/keywords/visual-language","display_name":"Visual language","score":0.4163999855518341},{"id":"https://openalex.org/keywords/imperfect","display_name":"Imperfect","score":0.39989998936653137},{"id":"https://openalex.org/keywords/transformative-learning","display_name":"Transformative learning","score":0.39910000562667847},{"id":"https://openalex.org/keywords/image-editing","display_name":"Image editing","score":0.3797999918460846},{"id":"https://openalex.org/keywords/visual-processing","display_name":"Visual processing","score":0.35199999809265137},{"id":"https://openalex.org/keywords/eye-tracking","display_name":"Eye tracking","score":0.33559998869895935}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7124999761581421},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.6434000134468079},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.5164999961853027},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.491100013256073},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.4268999993801117},{"id":"https://openalex.org/C2780878386","wikidata":"https://www.wikidata.org/wiki/Q1659648","display_name":"Visual language","level":2,"score":0.4163999855518341},{"id":"https://openalex.org/C2780310539","wikidata":"https://www.wikidata.org/wiki/Q12547192","display_name":"Imperfect","level":2,"score":0.39989998936653137},{"id":"https://openalex.org/C70587473","wikidata":"https://www.wikidata.org/wiki/Q7834111","display_name":"Transformative learning","level":2,"score":0.39910000562667847},{"id":"https://openalex.org/C2776674983","wikidata":"https://www.wikidata.org/wiki/Q545981","display_name":"Image editing","level":3,"score":0.3797999918460846},{"id":"https://openalex.org/C2778251979","wikidata":"https://www.wikidata.org/wiki/Q7936617","display_name":"Visual processing","level":3,"score":0.35199999809265137},{"id":"https://openalex.org/C56461940","wikidata":"https://www.wikidata.org/wiki/Q970687","display_name":"Eye tracking","level":2,"score":0.33559998869895935},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3321000039577484},{"id":"https://openalex.org/C124681953","wikidata":"https://www.wikidata.org/wiki/Q339062","display_name":"Decomposition","level":2,"score":0.30730000138282776},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.3043000102043152},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.3034000098705292},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.30160000920295715},{"id":"https://openalex.org/C2777055276","wikidata":"https://www.wikidata.org/wiki/Q7936580","display_name":"Visual approach","level":2,"score":0.29260000586509705},{"id":"https://openalex.org/C9417928","wikidata":"https://www.wikidata.org/wiki/Q1070689","display_name":"Image processing","level":3,"score":0.28929999470710754},{"id":"https://openalex.org/C89505385","wikidata":"https://www.wikidata.org/wiki/Q47146","display_name":"User interface","level":2,"score":0.2824999988079071},{"id":"https://openalex.org/C63075964","wikidata":"https://www.wikidata.org/wiki/Q3277307","display_name":"Visual rhetoric","level":3,"score":0.27880001068115234},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.27489998936653137},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.27320000529289246},{"id":"https://openalex.org/C160086991","wikidata":"https://www.wikidata.org/wiki/Q5939193","display_name":"Human visual system model","level":3,"score":0.266400009393692},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.25459998846054077},{"id":"https://openalex.org/C124304363","wikidata":"https://www.wikidata.org/wiki/Q673661","display_name":"Abstraction","level":2,"score":0.25}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3755393","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746027.3755393","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3746027.3755393","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746027.3755393","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":19,"referenced_works":["https://openalex.org/W1933349210","https://openalex.org/W2022699114","https://openalex.org/W2090048052","https://openalex.org/W2092460372","https://openalex.org/W2289988710","https://openalex.org/W2529436507","https://openalex.org/W2745461083","https://openalex.org/W2789636240","https://openalex.org/W2963349562","https://openalex.org/W2988916019","https://openalex.org/W3162032064","https://openalex.org/W3199693760","https://openalex.org/W4214490042","https://openalex.org/W4309674289","https://openalex.org/W4312910992","https://openalex.org/W4380995299","https://openalex.org/W4392172801","https://openalex.org/W4393161084","https://openalex.org/W4402727624"],"related_works":[],"abstract_inverted_index":{"Visual":[0,82],"impairment":[1],"affects":[2],"over":[3],"200":[4],"million":[5],"individuals":[6],"globally,":[7],"creating":[8],"significant":[9],"challenges":[10],"in":[11,75,183],"daily":[12],"visual":[13,67,93,142,198,210],"tasks.":[14],"While":[15],"vision-language":[16],"models":[17,56],"offer":[18],"transformative":[19],"assistive":[20],"potential,":[21],"existing":[22],"systems":[23],"based":[24,116],"on":[25,117],"Multimodal":[26],"Large":[27],"Language":[28],"Models":[29],"(MLLMs)":[30],"face":[31],"a":[32,85,127,154],"serious":[33],"cross-contamination":[34,98],"problem":[35],"when":[36,47],"processing":[37,49,107],"real-world":[38],"images":[39,51],"captured":[40],"by":[41,60,123],"blind":[42],"and":[43,52],"low-vision":[44],"(BLV)":[45],"users:":[46],"jointly":[48],"imperfect":[50],"specific":[53],"questions,":[54],"current":[55],"are":[57,114],"often":[58],"misled":[59],"question":[61,109,149],"assumptions":[62],"rather":[63],"than":[64],"adhering":[65],"to":[66,137,164,208],"facts,":[68],"generating":[69],"hallucinations":[70],"about":[71],"objects":[72],"not":[73],"present":[74],"the":[76,131,158],"image.":[77],"We":[78],"introduce":[79],"DR-VQA":[80],"(Decompose-then-Reconstruct":[81],"Question":[83],"Answering),":[84],"novel":[86],"framework":[87],"that":[88,167],"balances":[89],"user":[90,138,145,194],"intent":[91,195],"with":[92,144,161,196],"facts.":[94],"Our":[95,102],"approach":[96,103],"prevents":[97],"through":[99,126],"structured":[100,128],"reasoning.":[101],"deliberately":[104],"separates":[105],"image":[106,118],"from":[108,205],"analysis,":[110],"ensuring":[111],"model-generated":[112],"descriptions":[113,143],"strictly":[115],"facts":[119],"without":[120],"being":[121],"influenced":[122],"questions.":[124],"Subsequently,":[125],"decomposition":[129],"mechanism,":[130],"system":[132],"generates":[133],"targeted":[134],"sub-questions":[135],"relevant":[136],"intent,":[139],"gradually":[140],"aligning":[141],"needs":[146],"while":[147,187],"minimizing":[148],"bias.":[150],"During":[151],"final":[152],"synthesis,":[153],"memory-reset":[155],"LLM":[156],"reconstructs":[157],"reasoning":[159],"chain":[160],"detailed":[162],"information":[163,175],"generate":[165],"responses":[166],"either":[168],"provide":[169],"evidence-supported":[170],"conclusions":[171],"or":[172],"transparently":[173],"acknowledge":[174],"limitations.":[176],"Experimental":[177],"evaluations":[178],"demonstrate":[179],"our":[180],"framework's":[181],"effectiveness":[182],"reducing":[184],"hallucination":[185],"risks":[186],"improving":[188],"answer":[189],"accuracy.":[190],"By":[191],"systematically":[192],"balancing":[193],"factual":[197],"evidence,":[199],"this":[200],"work":[201],"advances":[202],"BLV-assistive":[203],"technologies":[204],"probabilistic":[206],"outputs":[207],"reliable":[209],"assistance":[211],"services.":[212]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-25T00:00:00"}
