{"id":"https://openalex.org/W7138018014","doi":"https://doi.org/10.1609/aaai.v40i24.39080","title":"D3ToM: Decider-Guided Dynamic Token Merging for Accelerating Diffusion MLLMs","display_name":"D3ToM: Decider-Guided Dynamic Token Merging for Accelerating Diffusion MLLMs","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7138018014","doi":"https://doi.org/10.1609/aaai.v40i24.39080"},"language":null,"primary_location":{"id":"doi:10.1609/aaai.v40i24.39080","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i24.39080","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/39080/43042","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://ojs.aaai.org/index.php/AAAI/article/download/39080/43042","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129738686","display_name":"Shuochen Chang","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Shuochen Chang","raw_affiliation_strings":["MoE Key Lab of Artificial Intelligence, Shanghai Jiao Tong University"],"affiliations":[{"raw_affiliation_string":"MoE Key Lab of Artificial Intelligence, Shanghai Jiao Tong University","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129727119","display_name":"Xiaofeng Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaofeng Zhang","raw_affiliation_strings":["MoE Key Lab of Artificial Intelligence, Shanghai Jiao Tong University"],"affiliations":[{"raw_affiliation_string":"MoE Key Lab of Artificial Intelligence, Shanghai Jiao Tong University","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129666511","display_name":"Qingyang Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qingyang Liu","raw_affiliation_strings":["MoE Key Lab of Artificial Intelligence, Shanghai Jiao Tong University"],"affiliations":[{"raw_affiliation_string":"MoE Key Lab of Artificial Intelligence, Shanghai Jiao Tong University","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5129673774","display_name":"Li Niu","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Li Niu","raw_affiliation_strings":["MoE Key Lab of Artificial Intelligence, Shanghai Jiao Tong University"],"affiliations":[{"raw_affiliation_string":"MoE Key Lab of Artificial Intelligence, Shanghai Jiao Tong University","institution_ids":["https://openalex.org/I183067930"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5129738686"],"corresponding_institution_ids":["https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.30074627,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"24","first_page":"19961","last_page":"19969"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.4065000116825104,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.4065000116825104,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.1843000054359436,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.13459999859333038,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.7591000199317932},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6378999948501587},{"id":"https://openalex.org/keywords/noise-reduction","display_name":"Noise reduction","score":0.6247000098228455},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.593500018119812},{"id":"https://openalex.org/keywords/merge","display_name":"Merge (version control)","score":0.5449000000953674},{"id":"https://openalex.org/keywords/autoregressive-model","display_name":"Autoregressive model","score":0.4318999946117401},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.34869998693466187},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.34790000319480896}],"concepts":[{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.7591000199317932},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7440000176429749},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6378999948501587},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.6247000098228455},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.593500018119812},{"id":"https://openalex.org/C197129107","wikidata":"https://www.wikidata.org/wiki/Q1921621","display_name":"Merge (version control)","level":2,"score":0.5449000000953674},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.49709999561309814},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.4318999946117401},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4198000133037567},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.34869998693466187},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.34790000319480896},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.34310001134872437},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3393999934196472},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.33379998803138733},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.3246999979019165},{"id":"https://openalex.org/C177774035","wikidata":"https://www.wikidata.org/wiki/Q1246948","display_name":"Granularity","level":2,"score":0.2985000014305115},{"id":"https://openalex.org/C2780719617","wikidata":"https://www.wikidata.org/wiki/Q1030752","display_name":"Salient","level":2,"score":0.29649999737739563},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2962999939918518},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.29030001163482666},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.28859999775886536},{"id":"https://openalex.org/C68710425","wikidata":"https://www.wikidata.org/wiki/Q5275442","display_name":"Diffusion process","level":3,"score":0.27720001339912415},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.274399995803833},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.2565999925136566}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v40i24.39080","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i24.39080","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/39080/43042","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i24.39080","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i24.39080","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/39080/43042","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W7138018014.pdf","grobid_xml":"https://content.openalex.org/works/W7138018014.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Diffusion-based":[0],"multimodal":[1],"large":[2],"language":[3],"models":[4],"(Diffusion":[5],"MLLMs)":[6],"have":[7],"recently":[8],"demonstrated":[9],"impressive":[10],"non-autoregressive":[11],"generative":[12],"capabilities":[13],"across":[14],"vision-and-language":[15],"tasks.":[16],"However,":[17],"Diffusion":[18,79,164],"MLLMs":[19],"exhibit":[20],"substantially":[21],"slower":[22],"inference":[23,77,179],"than":[24],"autoregressive":[25],"models:":[26],"Each":[27],"denoising":[28,73,83,94,155],"step":[29],"employs":[30,146],"full":[31],"bidirectional":[32],"self-attention":[33],"over":[34,100],"the":[35,92,110,116,132,159],"entire":[36],"sequence,":[37],"resulting":[38],"in":[39,78,91],"cubic":[40],"decoding":[41,161],"complexity":[42],"that":[43,65,150,176],"becomes":[44],"computationally":[45],"impractical":[46],"with":[47,153,158],"thousands":[48],"of":[49,109,163],"visual":[50,69,102,133],"tokens.":[51,103],"To":[52],"address":[53],"this":[54],"challenge,":[55],"we":[56],"propose":[57],"D\u00b3ToM,":[58],"a":[59,107,126,147],"Decider-guided":[60],"dynamic":[61],"token":[62,134],"merging":[63],"method":[64],"dynamically":[66,151],"merges":[67,115],"redundant":[68],"tokens":[70,89,113],"at":[71],"different":[72],"steps":[74],"to":[75],"accelerate":[76],"MLLMs.":[80],"At":[81],"each":[82,154],"step,":[84,156],"D\u00b3ToM":[85,145,177],"uses":[86],"decider":[87],"tokens\u2014the":[88],"generated":[90],"previous":[93],"step\u2014to":[95],"build":[96],"an":[97],"importance":[98],"map":[99],"all":[101,137],"Then":[104],"it":[105],"maintains":[106],"proportion":[108],"most":[111],"salient":[112],"and":[114],"remainder":[117],"through":[118],"similarity-based":[119],"aggregation.":[120],"This":[121],"plug-and-play":[122],"module":[123],"integrates":[124],"into":[125],"single":[127],"transformer":[128],"layer,":[129],"physically":[130],"shortening":[131],"sequence":[135],"for":[136],"subsequent":[138],"layers":[139],"without":[140],"altering":[141],"model":[142],"parameters.":[143],"Moreover,":[144],"merge":[148],"ratio":[149],"varies":[152],"aligns":[157],"native":[160],"process":[162],"MLLMs,":[165],"achieving":[166],"superior":[167],"performance":[168],"under":[169],"equivalent":[170],"computational":[171],"budgets.":[172],"Extensive":[173],"experiments":[174],"show":[175],"accelerates":[178],"while":[180],"preserving":[181],"competitive":[182],"performance.":[183]},"counts_by_year":[],"updated_date":"2026-03-20T20:47:17.329874","created_date":"2026-03-18T00:00:00"}
