{"id":"https://openalex.org/W7160444000","doi":"https://doi.org/10.48550/arxiv.2605.02946","title":"RouteHijack: Routing-Aware Attack on Mixture-of-Experts LLMs","display_name":"RouteHijack: Routing-Aware Attack on Mixture-of-Experts LLMs","publication_year":2026,"publication_date":"2026-05-01","ids":{"openalex":"https://openalex.org/W7160444000","doi":"https://doi.org/10.48550/arxiv.2605.02946"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.02946","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.02946","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.02946","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135522476","display_name":"Zhiyuan Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Xu, Zhiyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102722651","display_name":"Joseph Gardiner","orcid":"https://orcid.org/0000-0003-4748-4228"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gardiner, Joseph","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5044341219","display_name":"Sana Belguith","orcid":"https://orcid.org/0000-0003-0069-8552"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Belguith, Sana","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5045570758","display_name":"Lichao Wu","orcid":"https://orcid.org/0000-0002-7139-732X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Lichao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5135522476"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.7939000129699707,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.7939000129699707,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.041999999433755875,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.024800000712275505,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.7608000040054321},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.5030999779701233},{"id":"https://openalex.org/keywords/heuristic","display_name":"Heuristic","score":0.46459999680519104},{"id":"https://openalex.org/keywords/vulnerability","display_name":"Vulnerability (computing)","score":0.46320000290870667},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.46050000190734863},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4453999996185303},{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.41909998655319214},{"id":"https://openalex.org/keywords/restrictiveness","display_name":"Restrictiveness","score":0.37770000100135803}],"concepts":[{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.7608000040054321},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.5590000152587891},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5590000152587891},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.5030999779701233},{"id":"https://openalex.org/C173801870","wikidata":"https://www.wikidata.org/wiki/Q201413","display_name":"Heuristic","level":2,"score":0.46459999680519104},{"id":"https://openalex.org/C95713431","wikidata":"https://www.wikidata.org/wiki/Q631425","display_name":"Vulnerability (computing)","level":2,"score":0.46320000290870667},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.46050000190734863},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4453999996185303},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.41909998655319214},{"id":"https://openalex.org/C2776435913","wikidata":"https://www.wikidata.org/wiki/Q7316334","display_name":"Restrictiveness","level":2,"score":0.37770000100135803},{"id":"https://openalex.org/C2777693866","wikidata":"https://www.wikidata.org/wiki/Q359099","display_name":"Blackout","level":4,"score":0.3727000057697296},{"id":"https://openalex.org/C2780665704","wikidata":"https://www.wikidata.org/wiki/Q959298","display_name":"Intervention (counseling)","level":2,"score":0.32760000228881836},{"id":"https://openalex.org/C175291020","wikidata":"https://www.wikidata.org/wiki/Q1156822","display_name":"Offset (computer science)","level":2,"score":0.32330000400543213},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.31949999928474426},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3077999949455261},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.30660000443458557},{"id":"https://openalex.org/C42475967","wikidata":"https://www.wikidata.org/wiki/Q194292","display_name":"Operations research","level":1,"score":0.3037000000476837},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.2971999943256378},{"id":"https://openalex.org/C167063184","wikidata":"https://www.wikidata.org/wiki/Q1400839","display_name":"Vulnerability assessment","level":3,"score":0.29420000314712524},{"id":"https://openalex.org/C74172769","wikidata":"https://www.wikidata.org/wiki/Q1446839","display_name":"Routing (electronic design automation)","level":2,"score":0.2930999994277954},{"id":"https://openalex.org/C3017944768","wikidata":"https://www.wikidata.org/wiki/Q1450463","display_name":"Poison control","level":2,"score":0.28049999475479126},{"id":"https://openalex.org/C65856478","wikidata":"https://www.wikidata.org/wiki/Q3991682","display_name":"Attack model","level":2,"score":0.2700999975204468},{"id":"https://openalex.org/C147494362","wikidata":"https://www.wikidata.org/wiki/Q2078905","display_name":"Troubleshooting","level":2,"score":0.2623000144958496},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.26089999079704285},{"id":"https://openalex.org/C2777834853","wikidata":"https://www.wikidata.org/wiki/Q96776939","display_name":"Liability","level":2,"score":0.2524000108242035}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.02946","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.02946","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.02946","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.02946","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.6717753410339355,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Safety":[0],"alignment":[1],"is":[2,89,93,168],"critical":[3],"for":[4,83,241],"the":[5,70,165,239],"responsible":[6],"deployment":[7],"of":[8,99],"large":[9],"language":[10],"models":[11,67],"(LLMs).":[12],"As":[13],"Mixture-of-Experts":[14],"(MoE)":[15],"architectures":[16,236],"are":[17,62],"increasingly":[18],"adopted":[19],"to":[20,52,65,69,104,125,170,211,216,225],"scale":[21],"model":[22,46,106],"capacity,":[23],"understanding":[24],"their":[25],"safety":[26,91,151],"robustness":[27],"becomes":[28],"essential.":[29],"Existing":[30],"adversarial":[31,143],"attacks,":[32],"however,":[33],"have":[34],"notable":[35],"limitations.":[36],"Prompt-based":[37],"jailbreaks":[38],"rely":[39],"on":[40,116],"heuristic":[41],"search":[42],"and":[43,55,61,128,137,156,213,237],"transfer":[44],"poorly,":[45],"intervention":[47],"methods":[48],"require":[49],"privileged":[50],"access":[51],"internal":[53],"representations,":[54],"optimization-based":[56,193],"input":[57,113,176],"attacks":[58],"remain":[59],"output-centric":[60],"fundamentally":[63],"limited":[64],"MoE":[66,84,180,204],"due":[68],"non-differentiable":[71],"routing":[72,110],"mechanism.":[73],"In":[74],"this":[75,117],"paper,":[76],"we":[77],"present":[78],"RouteHijack,":[79],"a":[80,96,146,171,184,230],"routing-aware":[81,147],"jailbreak":[82],"LLMs.":[85],"Our":[86],"key":[87],"insight":[88],"that":[90,149],"behavior":[92,107],"concentrated":[94],"in":[95,233],"small":[97],"subset":[98],"experts,":[100,152,155],"creating":[101],"an":[102],"opportunity":[103],"steer":[105],"by":[108,131,195],"influencing":[109],"decisions":[111],"through":[112],"optimization.":[114],"Building":[115],"observation,":[118],"RouteHijack":[119,182,197],"first":[120],"performs":[121],"response-driven":[122],"expert":[123,235],"localization":[124],"identify":[126],"safety-critical":[127],"harmful":[129,138,154],"experts":[130],"contrasting":[132],"activations":[133],"under":[134],"safe":[135],"refusals":[136],"completions.":[139],"It":[140],"then":[141],"constructs":[142],"suffixes":[144],"with":[145],"objective":[148],"suppresses":[150],"promotes":[153],"prevents":[157],"early-stage":[158],"refusal":[159],"during":[160],"generation.":[161],"At":[162],"inference":[163],"time,":[164],"optimized":[166],"suffix":[167],"appended":[169],"malicious":[172],"prompt,":[173],"requiring":[174],"only":[175],"access.":[177],"Across":[178],"seven":[179],"LLMs,":[181],"achieves":[183],"69.3\\%":[185],"average":[186,207,221],"attack":[187,194],"success":[188],"rate":[189],"(ASR),":[190],"outperforming":[191],"prior":[192],"$3.2\\times$.":[196],"also":[198],"transfers":[199],"zero-shot":[200],"across":[201],"five":[202],"sibling":[203],"variants,":[205],"raising":[206],"ASR":[208,222],"from":[209,223],"27.7\\%":[210],"61.2\\%,":[212],"further":[214],"generalizes":[215],"three":[217],"MoE-based":[218],"VLMs,":[219],"increasing":[220],"2.47\\%":[224],"38.7\\%.":[226],"These":[227],"findings":[228],"expose":[229],"fundamental":[231],"vulnerability":[232],"sparse":[234],"highlight":[238],"need":[240],"defenses":[242],"beyond":[243],"output-level":[244],"alignment.":[245]},"counts_by_year":[],"updated_date":"2026-05-07T06:12:12.454206","created_date":"2026-05-07T00:00:00"}
