{"id":"https://openalex.org/W4416036166","doi":"https://doi.org/10.18653/v1/2025.emnlp-main.717","title":"MMAPG: A Training-Free Framework for Multimodal Multi-hop Question Answering via Adaptive Planning Graphs","display_name":"MMAPG: A Training-Free Framework for Multimodal Multi-hop Question Answering via Adaptive Planning Graphs","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4416036166","doi":"https://doi.org/10.18653/v1/2025.emnlp-main.717"},"language":null,"primary_location":{"id":"doi:10.18653/v1/2025.emnlp-main.717","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.717","pdf_url":"https://aclanthology.org/2025.emnlp-main.717.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.emnlp-main.717.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5069090040","display_name":"Yiheng Hu","orcid":"https://orcid.org/0000-0001-6127-1101"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yiheng Hu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087768777","display_name":"Xiaoyang Wang","orcid":"https://orcid.org/0000-0003-3554-3219"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiaoyang Wang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010128384","display_name":"Qing Liu","orcid":"https://orcid.org/0000-0002-5797-8179"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qing Liu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006841485","display_name":"Xiwei Xu","orcid":"https://orcid.org/0000-0002-2273-1862"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiwei Xu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014372038","display_name":"Qian Fu","orcid":"https://orcid.org/0000-0002-7126-2994"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qian Fu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113159317","display_name":"Wenjie Zhang","orcid":"https://orcid.org/0009-0008-9897-8416"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wenjie Zhang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5064683660","display_name":"Liming Zhu","orcid":"https://orcid.org/0000-0001-5839-3765"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liming Zhu","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5069090040"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.35225319,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"14195","last_page":"14211"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5728999972343445,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5728999972343445,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10906","display_name":"AI-based Problem Solving and Planning","score":0.10639999806880951,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.0934000015258789,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.5629000067710876},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.29019999504089355},{"id":"https://openalex.org/keywords/class","display_name":"Class (philosophy)","score":0.2793000042438507},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.27140000462532043},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.27129998803138733}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6453999876976013},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.5629000067710876},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5108000040054321},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.29019999504089355},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.2793000042438507},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.27140000462532043},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.27129998803138733},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2547999918460846},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.24860000610351562},{"id":"https://openalex.org/C180198813","wikidata":"https://www.wikidata.org/wiki/Q121182","display_name":"Information system","level":2,"score":0.24369999766349792}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2025.emnlp-main.717","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.717","pdf_url":"https://aclanthology.org/2025.emnlp-main.717.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.emnlp-main.717","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.717","pdf_url":"https://aclanthology.org/2025.emnlp-main.717.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4416036166.pdf","grobid_xml":"https://content.openalex.org/works/W4416036166.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Multimodal":[0],"Multi-hop":[1],"question":[2],"answering":[3],"requires":[4],"integrating":[5],"information":[6,135],"from":[7],"diverse":[8],"sources,":[9],"such":[10],"as":[11],"images":[12],"and":[13,24,77,95,104,150],"texts,":[14],"to":[15,40,43,97,114,125],"derive":[16],"answers.Existing":[17],"methods":[18],"typically":[19],"rely":[20,162],"on":[21,30,148,163],"sequential":[22],"retrieval":[23,76,111],"reasoning,":[25],"where":[26,96],"each":[27],"step":[28],"builds":[29],"the":[31,83,87,92,99,131,146],"previous":[32],"output.However,":[33],"this":[34],"single-path":[35],"paradigm":[36],"makes":[37],"them":[38],"vulnerable":[39],"errors":[41],"due":[42],"misleading":[44],"intermediate":[45],"steps.Moreover,":[46],"developing":[47],"multimodal":[48,134],"models":[49,160],"can":[50],"be":[51],"computationally":[52],"expensive,":[53],"often":[54],"requiring":[55],"extensive":[56],"training.To":[57],"address":[58],"these":[59],"limitations,":[60],"we":[61,118],"propose":[62],"a":[63],"training-free":[64],"framework":[65],"guided":[66],"by":[67],"an":[68],"Adaptive":[69,88],"Planning":[70,89],"Graph,":[71,90],"which":[72,101],"consists":[73],"of":[74,86,107,112,133],"planning,":[75],"reasoning":[78,108],"modules.The":[79],"planning":[80],"module":[81],"analyzes":[82],"current":[84],"state":[85],"determines":[91],"next":[93],"action":[94],"expand":[98],"graph,":[100],"enables":[102],"dynamic":[103],"flexible":[105],"exploration":[106],"paths.To":[109],"handle":[110],"text":[113],"unspecified":[115],"target":[116],"modalities,":[117],"devise":[119],"modality-specific":[120],"strategies":[121],"that":[122,153,161],"dynamically":[123],"adapt":[124],"distinct":[126],"data":[127],"types.Our":[128],"approach":[129,155],"preserves":[130],"characteristics":[132],"without":[136],"costly":[137],"task-specific":[138],"training,":[139],"enabling":[140],"seamless":[141],"integration":[142],"with":[143],"upto-date":[144],"models.Finally,":[145],"experiments":[147],"MultimodalQA":[149],"WebQA":[151],"show":[152],"our":[154],"matches":[156],"or":[157],"outperforms":[158],"existing":[159],"training.":[164]},"counts_by_year":[],"updated_date":"2026-03-11T06:11:40.159057","created_date":"2025-11-08T00:00:00"}
