{"id":"https://openalex.org/W4410745169","doi":"https://doi.org/10.1007/s11263-025-02480-w","title":"AutoViT: Achieving Real-Time Vision Transformers on Mobile via Latency-aware Coarse-to-Fine Search","display_name":"AutoViT: Achieving Real-Time Vision Transformers on Mobile via Latency-aware Coarse-to-Fine Search","publication_year":2025,"publication_date":"2025-05-26","ids":{"openalex":"https://openalex.org/W4410745169","doi":"https://doi.org/10.1007/s11263-025-02480-w"},"language":"en","primary_location":{"id":"doi:10.1007/s11263-025-02480-w","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s11263-025-02480-w","pdf_url":"https://link.springer.com/content/pdf/10.1007/s11263-025-02480-w.pdf","source":{"id":"https://openalex.org/S25538012","display_name":"International Journal of Computer Vision","issn_l":"0920-5691","issn":["0920-5691","1573-1405"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"International Journal of Computer Vision","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://link.springer.com/content/pdf/10.1007/s11263-025-02480-w.pdf","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5078971265","display_name":"Zhenglun Kong","orcid":"https://orcid.org/0000-0002-8120-4456"},"institutions":[{"id":"https://openalex.org/I12912129","display_name":"Northeastern University","ror":"https://ror.org/04t5xt781","country_code":"US","type":"education","lineage":["https://openalex.org/I12912129"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Zhenglun Kong","raw_affiliation_strings":["Department of Electrical and Computer Engineering, Northeastern University, Boston, MA, 02115, USA"],"raw_orcid":"https://orcid.org/0000-0002-8120-4456","affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, Northeastern University, Boston, MA, 02115, USA","institution_ids":["https://openalex.org/I12912129"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068433690","display_name":"Dongkuan Xu","orcid":"https://orcid.org/0000-0002-1456-9658"},"institutions":[{"id":"https://openalex.org/I137902535","display_name":"North Carolina State University","ror":"https://ror.org/04tj63d06","country_code":"US","type":"education","lineage":["https://openalex.org/I137902535"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Dongkuan Xu","raw_affiliation_strings":["North Carolina State University, Raleigh, NC, 27695, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"North Carolina State University, Raleigh, NC, 27695, USA","institution_ids":["https://openalex.org/I137902535"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101633365","display_name":"Zhengang Li","orcid":"https://orcid.org/0000-0001-6644-4761"},"institutions":[{"id":"https://openalex.org/I12912129","display_name":"Northeastern University","ror":"https://ror.org/04t5xt781","country_code":"US","type":"education","lineage":["https://openalex.org/I12912129"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhengang Li","raw_affiliation_strings":["Department of Electrical and Computer Engineering, Northeastern University, Boston, MA, 02115, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, Northeastern University, Boston, MA, 02115, USA","institution_ids":["https://openalex.org/I12912129"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089349472","display_name":"Peiyan Dong","orcid":"https://orcid.org/0000-0001-5287-5149"},"institutions":[{"id":"https://openalex.org/I12912129","display_name":"Northeastern University","ror":"https://ror.org/04t5xt781","country_code":"US","type":"education","lineage":["https://openalex.org/I12912129"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Peiyan Dong","raw_affiliation_strings":["Department of Electrical and Computer Engineering, Northeastern University, Boston, MA, 02115, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, Northeastern University, Boston, MA, 02115, USA","institution_ids":["https://openalex.org/I12912129"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102082097","display_name":"Hao Tang","orcid":"https://orcid.org/0009-0001-6045-5934"},"institutions":[{"id":"https://openalex.org/I12912129","display_name":"Northeastern University","ror":"https://ror.org/04t5xt781","country_code":"US","type":"education","lineage":["https://openalex.org/I12912129"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Hao Tang","raw_affiliation_strings":["Department of Electrical and Computer Engineering, Northeastern University, Boston, MA, 02115, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, Northeastern University, Boston, MA, 02115, USA","institution_ids":["https://openalex.org/I12912129"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100651384","display_name":"Yanzhi Wang","orcid":"https://orcid.org/0000-0002-3024-7990"},"institutions":[{"id":"https://openalex.org/I12912129","display_name":"Northeastern University","ror":"https://ror.org/04t5xt781","country_code":"US","type":"education","lineage":["https://openalex.org/I12912129"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yanzhi Wang","raw_affiliation_strings":["Department of Electrical and Computer Engineering, Northeastern University, Boston, MA, 02115, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, Northeastern University, Boston, MA, 02115, USA","institution_ids":["https://openalex.org/I12912129"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5103853262","display_name":"Subhabrata Mukherjee","orcid":null},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Subhabrata Mukherjee","raw_affiliation_strings":["Microsoft Corporation, Redmond, WA, 98052, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Microsoft Corporation, Redmond, WA, 98052, USA","institution_ids":["https://openalex.org/I1290206253"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5078971265"],"corresponding_institution_ids":["https://openalex.org/I12912129"],"apc_list":{"value":2890,"currency":"EUR","value_usd":3690},"apc_paid":{"value":2890,"currency":"EUR","value_usd":3690},"fwci":3.6824,"has_fulltext":true,"cited_by_count":4,"citation_normalized_percentile":{"value":0.93135097,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":99},"biblio":{"volume":"133","issue":"9","first_page":"6170","last_page":"6186"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11992","display_name":"CCD and CMOS Imaging Sensors","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6699926257133484},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.58077073097229},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.560875654220581},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5548134446144104},{"id":"https://openalex.org/keywords/real-time-computing","display_name":"Real-time computing","score":0.48033514618873596},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.47080907225608826},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3887163996696472},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.13982701301574707},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.11279210448265076},{"id":"https://openalex.org/keywords/electrical-engineering","display_name":"Electrical engineering","score":0.07837998867034912},{"id":"https://openalex.org/keywords/voltage","display_name":"Voltage","score":0.0680483877658844}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6699926257133484},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.58077073097229},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.560875654220581},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5548134446144104},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.48033514618873596},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.47080907225608826},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3887163996696472},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.13982701301574707},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.11279210448265076},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.07837998867034912},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0680483877658844}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1007/s11263-025-02480-w","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s11263-025-02480-w","pdf_url":"https://link.springer.com/content/pdf/10.1007/s11263-025-02480-w.pdf","source":{"id":"https://openalex.org/S25538012","display_name":"International Journal of Computer Vision","issn_l":"0920-5691","issn":["0920-5691","1573-1405"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"International Journal of Computer Vision","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1007/s11263-025-02480-w","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s11263-025-02480-w","pdf_url":"https://link.springer.com/content/pdf/10.1007/s11263-025-02480-w.pdf","source":{"id":"https://openalex.org/S25538012","display_name":"International Journal of Computer Vision","issn_l":"0920-5691","issn":["0920-5691","1573-1405"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"International Journal of Computer Vision","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4410745169.pdf","grobid_xml":"https://content.openalex.org/works/W4410745169.grobid-xml"},"referenced_works_count":52,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W2108598243","https://openalex.org/W2138011018","https://openalex.org/W2533598788","https://openalex.org/W2883780447","https://openalex.org/W2946948417","https://openalex.org/W2962746461","https://openalex.org/W2962772649","https://openalex.org/W2963150697","https://openalex.org/W2963163009","https://openalex.org/W2963446712","https://openalex.org/W2963918968","https://openalex.org/W2990138404","https://openalex.org/W2992308087","https://openalex.org/W2998508940","https://openalex.org/W3035251378","https://openalex.org/W3035400692","https://openalex.org/W3035682985","https://openalex.org/W3094502228","https://openalex.org/W3096533519","https://openalex.org/W3109946440","https://openalex.org/W3116489684","https://openalex.org/W3118608800","https://openalex.org/W3121523901","https://openalex.org/W3131500599","https://openalex.org/W3136416617","https://openalex.org/W3137278571","https://openalex.org/W3138516171","https://openalex.org/W3151130473","https://openalex.org/W3165924482","https://openalex.org/W3168622113","https://openalex.org/W3169769781","https://openalex.org/W3170841864","https://openalex.org/W3171125843","https://openalex.org/W3184682079","https://openalex.org/W3202742610","https://openalex.org/W3204801262","https://openalex.org/W3211422090","https://openalex.org/W4214588794","https://openalex.org/W4214624153","https://openalex.org/W4214636423","https://openalex.org/W4226437046","https://openalex.org/W4281756776","https://openalex.org/W4307823382","https://openalex.org/W4312820606","https://openalex.org/W4365796094","https://openalex.org/W4387910965","https://openalex.org/W4393148276","https://openalex.org/W4393160572","https://openalex.org/W4409262775","https://openalex.org/W6779566397","https://openalex.org/W6803469875"],"related_works":["https://openalex.org/W2049261842","https://openalex.org/W2983245704","https://openalex.org/W2755342338","https://openalex.org/W4386041267","https://openalex.org/W2779427294","https://openalex.org/W2898395619","https://openalex.org/W2775347418","https://openalex.org/W2625805835","https://openalex.org/W2954284861","https://openalex.org/W3036465205"],"abstract_inverted_index":{"Abstract":[0],"Despite":[1],"their":[2],"impressive":[3],"performance":[4],"on":[5,39,62,124,192],"various":[6],"tasks,":[7],"vision":[8,15],"transformers":[9],"(ViTs)":[10],"are":[11],"heavy":[12],"for":[13,111],"mobile":[14],"applications.":[16],"Recent":[17],"works":[18],"have":[19],"proposed":[20],"combining":[21],"the":[22,53,104,109,115,129,142,158,161,165],"strengths":[23],"of":[24,46,55,97,128,131,145,160],"ViTs":[25,59,180],"and":[26,65,82,95,101,134,175,184,211],"convolutional":[27],"neural":[28,69],"networks":[29],"(CNNs)":[30],"to":[31,99,140,152,156],"build":[32],"lightweight":[33,179],"networks.":[34],"Still,":[35],"these":[36],"approaches":[37],"rely":[38],"hand-designed":[40],"architectures":[41],"with":[42,207],"a":[43,74,120,125],"pre-determined":[44],"number":[45],"parameters.":[47],"In":[48],"this":[49],"work,":[50],"we":[51,118],"address":[52],"challenge":[54],"finding":[56],"optimal":[57],"light-weight":[58],"given":[60],"constraints":[61],"model":[63,80,163],"size":[64],"computational":[66],"cost":[67],"using":[68],"architecture":[70],"search.":[71],"We":[72],"use":[73,119],"search":[75,116,147,166],"algorithm":[76],"that":[77],"considers":[78],"both":[79],"parameters":[81,174],"on-device":[83],"deployment":[84],"latency.":[85,106,214],"This":[86],"method":[87],"analyzes":[88],"network":[89,105],"properties,":[90],"hardware":[91],"memory":[92],"access":[93],"pattern,":[94],"degree":[96],"parallelism":[98],"directly":[100],"accurately":[102],"estimate":[103],"To":[107],"prevent":[108],"need":[110],"extensive":[112],"testing":[113,157],"during":[114,164],"process,":[117],"lookup":[121],"table":[122],"based":[123],"detailed":[126],"breakdown":[127],"speed":[130,159],"each":[132,146],"component":[133],"operation,":[135],"which":[136],"can":[137],"be":[138],"reused":[139],"evaluate":[141],"whole":[143,162],"latency":[144,186],"structure.":[148],"Our":[149],"approach":[150],"leads":[151],"improved":[153],"efficiency":[154],"compared":[155],"process.":[167],"Extensive":[168],"experiments":[169],"demonstrate":[170],"that,":[171],"under":[172],"similar":[173],"FLOPs,":[176],"our":[177],"searched":[178],"achieve":[181],"higher":[182,209],"accuracy":[183,210],"lower":[185,213],"than":[187],"state-of-the-art":[188],"models.":[189],"For":[190],"instance,":[191],"ImageNet-1K,":[193],"AutoViT_XXS":[194],"(71.3%":[195],"Top-1":[196,203],"accuracy,":[197,204],"10.2ms":[198],"latency)":[199,206],"outperforms":[200],"MobileViTv3_XXS":[201],"(71.0%":[202],"12.5ms":[205],"0.3%":[208],"2.3ms":[212]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":2}],"updated_date":"2026-06-13T06:13:01.061226","created_date":"2025-10-10T00:00:00"}
