{"id":"https://openalex.org/W7138253513","doi":"https://doi.org/10.48550/arxiv.2603.14989","title":"MMSpec: Benchmarking Speculative Decoding for Vision-Language Models","display_name":"MMSpec: Benchmarking Speculative Decoding for Vision-Language Models","publication_year":2026,"publication_date":"2026-03-16","ids":{"openalex":"https://openalex.org/W7138253513","doi":"https://doi.org/10.48550/arxiv.2603.14989"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.14989","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.14989","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.14989","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129716111","display_name":"Hui Shen","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Shen, Hui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129730964","display_name":"Xin Eric Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129746753","display_name":"Ping Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Ping","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129701302","display_name":"Yunta Hsieh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hsieh, Yunta","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129688910","display_name":"Qi Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Qi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129727585","display_name":"Zhongwei Wan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wan, Zhongwei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129642655","display_name":"Ziheng Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Ziheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126827023","display_name":"Jingxuan Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Jingxuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129747692","display_name":"Jing Xiong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiong, Jing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129683818","display_name":"Ziyuan Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Ziyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129725849","display_name":"Yifan Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yifan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010356912","display_name":"H. Cao","orcid":"https://orcid.org/0009-0005-1611-7478"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Hangrui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129654059","display_name":"Chenyang Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Chenyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129662148","display_name":"Mi Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Mi","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":14,"corresponding_author_ids":["https://openalex.org/A5129716111"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8804000020027161,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8804000020027161,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.016100000590085983,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.01080000028014183,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.7566999793052673},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5214999914169312},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5121999979019165},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.5120999813079834},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5077999830245972},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.4742000102996826},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4399000108242035}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8054999709129333},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.7566999793052673},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5214999914169312},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5121999979019165},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.5120999813079834},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5077999830245972},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.4742000102996826},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4399000108242035},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.43230000138282776},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4122999906539917},{"id":"https://openalex.org/C46637626","wikidata":"https://www.wikidata.org/wiki/Q6693015","display_name":"Low latency (capital markets)","level":2,"score":0.39649999141693115},{"id":"https://openalex.org/C193969084","wikidata":"https://www.wikidata.org/wiki/Q7452500","display_name":"Sequential decoding","level":4,"score":0.3652999997138977},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.35749998688697815},{"id":"https://openalex.org/C47941915","wikidata":"https://www.wikidata.org/wiki/Q107885","display_name":"Speculation","level":2,"score":0.35030001401901245},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.30809998512268066},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.2904999852180481},{"id":"https://openalex.org/C40743351","wikidata":"https://www.wikidata.org/wiki/Q7002049","display_name":"Neural decoding","level":3,"score":0.2526000142097473},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.25119999051094055}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.14989","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.14989","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.14989","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.14989","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision-language":[0],"models":[1],"(VLMs)":[2],"achieve":[3],"strong":[4],"performance":[5],"on":[6],"multimodal":[7,22,58,90],"tasks":[8],"but":[9],"suffer":[10],"from":[11],"high":[12],"inference":[13],"latency":[14,111],"due":[15],"to":[16,129],"large":[17],"model":[18],"sizes":[19],"and":[20,64,102,132],"long":[21],"contexts.":[23],"Speculative":[24],"decoding":[25,51,69,123],"has":[26],"recently":[27],"emerged":[28],"as":[29],"an":[30],"effective":[31],"acceleration":[32],"technique,":[33],"yet":[34],"its":[35],"behavior":[36],"in":[37,52,89],"VLMs":[38],"remains":[39],"insufficiently":[40],"understood.":[41],"We":[42],"introduce":[43],"MMSpec,":[44],"the":[45],"first":[46],"benchmark":[47],"for":[48,85],"evaluating":[49],"speculative":[50,68,122],"vision-language":[53],"models.":[54],"MMSpec":[55],"contains":[56],"600":[57],"samples":[59],"across":[60],"six":[61],"task":[62],"categories":[63],"integrates":[65],"ten":[66],"representative":[67],"algorithms":[70],"under":[71],"a":[72,120],"unified":[73],"evaluation":[74],"framework.":[75],"Our":[76],"study":[77],"reveals":[78],"three":[79],"key":[80],"findings:":[81],"(1)":[82],"methods":[83],"designed":[84],"text-only":[86],"LLMs":[87],"degrade":[88],"scenarios,":[91],"(2)":[92],"vision":[93,130],"awareness":[94],"becomes":[95],"increasingly":[96],"important":[97],"at":[98],"larger":[99],"batch":[100],"sizes,":[101],"(3)":[103],"throughput":[104],"speedup":[105],"alone":[106],"does":[107],"not":[108],"reliably":[109],"reflect":[110],"performance.":[112,135],"Motivated":[113],"by":[114],"these":[115],"findings,":[116],"we":[117],"propose":[118],"ViSkip,":[119],"plug-and-play":[121],"method":[124],"that":[125],"dynamically":[126],"adapts":[127],"speculation":[128],"tokens":[131],"achieves":[133],"state-of-the-art":[134]},"counts_by_year":[],"updated_date":"2026-03-18T06:31:55.123368","created_date":"2026-03-18T00:00:00"}
