{"id":"https://openalex.org/W7154597991","doi":"https://doi.org/10.48550/arxiv.2604.13586","title":"Efficient Multi-View 3D Object Detection by Dynamic Token Selection and Fine-Tuning","display_name":"Efficient Multi-View 3D Object Detection by Dynamic Token Selection and Fine-Tuning","publication_year":2026,"publication_date":"2026-04-15","ids":{"openalex":"https://openalex.org/W7154597991","doi":"https://doi.org/10.48550/arxiv.2604.13586"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.13586","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.13586","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.13586","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5033168344","display_name":"Danish Nazir","orcid":"https://orcid.org/0000-0001-6364-8427"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Nazir, Danish","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114657298","display_name":"Antoine Hanna-Asaad","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hanna-Asaad, Antoine","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5119288520","display_name":"Lucas G\u00f6rnhardt","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"G\u00f6rnhardt, Lucas","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018524519","display_name":"Jan Piewek","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Piewek, Jan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070844255","display_name":"T. Bagdonat","orcid":"https://orcid.org/0000-0002-4008-6721"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bagdonat, Thorsten","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5002593702","display_name":"Tim Fingscheidt","orcid":"https://orcid.org/0000-0002-8895-5041"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fingscheidt, Tim","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5033168344"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9606000185012817,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9606000185012817,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.003800000064074993,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.0035000001080334187,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.8374999761581421},{"id":"https://openalex.org/keywords/object-detection","display_name":"Object detection","score":0.663100004196167},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5112000107765198},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.4569000005722046},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.4259999990463257},{"id":"https://openalex.org/keywords/computational-complexity-theory","display_name":"Computational complexity theory","score":0.41019999980926514},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4043999910354614},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.4020000100135803}],"concepts":[{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.8374999761581421},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7397000193595886},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.663100004196167},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5882999897003174},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5112000107765198},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.4569000005722046},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.4259999990463257},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.424699991941452},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.41019999980926514},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4043999910354614},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.4020000100135803},{"id":"https://openalex.org/C175291020","wikidata":"https://www.wikidata.org/wiki/Q1156822","display_name":"Offset (computer science)","level":2,"score":0.34940001368522644},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.34929999709129333},{"id":"https://openalex.org/C115067241","wikidata":"https://www.wikidata.org/wiki/Q1639854","display_name":"Token passing","level":3,"score":0.3441999852657318},{"id":"https://openalex.org/C136886441","wikidata":"https://www.wikidata.org/wiki/Q926129","display_name":"Normalization (sociology)","level":2,"score":0.33709999918937683},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.30070000886917114},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.3003999888896942},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2922999858856201},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.29109999537467957},{"id":"https://openalex.org/C151201525","wikidata":"https://www.wikidata.org/wiki/Q177239","display_name":"Limit (mathematics)","level":2,"score":0.2870999872684479},{"id":"https://openalex.org/C63584917","wikidata":"https://www.wikidata.org/wiki/Q333286","display_name":"Bounding overwatch","level":2,"score":0.25290000438690186}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.13586","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.13586","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.13586","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.13586","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Existing":[0],"multi-view":[1,31,74,98,153],"three-dimensional":[2],"(3D)":[3],"object":[4,34,76,100,155],"detection":[5,35,77,156,194],"approaches":[6,157],"widely":[7],"adopt":[8],"large-scale":[9,148],"pre-trained":[10],"vision":[11],"transformer":[12],"(ViT)-based":[13],"foundation":[14],"models":[15],"as":[16],"backbones,":[17],"being":[18],"computationally":[19],"complex.":[20],"To":[21],"address":[22],"this":[23,80],"problem,":[24],"current":[25],"state-of-the-art":[26],"(SOTA)":[27],"\\texttt{ToC3D}":[28],"for":[29,72,93],"efficient":[30],"ViT-based":[32],"3D":[33,75,99,154],"employs":[36],"ego-motion-based":[37],"relevant":[38],"token":[39,51,86,91,109],"selection.":[40],"However,":[41],"there":[42],"are":[43],"two":[44],"key":[45],"limitations:":[46],"(1)":[47],"The":[48],"fixed":[49],"layer-individual":[50],"selection":[52,92,110],"ratios":[53],"limit":[54],"computational":[55,164],"efficiency":[56],"during":[57],"both":[58],"training":[59],"and":[60,192],"inference.":[61],"(2)":[62],"Full":[63],"end-to-end":[64],"retraining":[65],"of":[66,132],"the":[67,73,112,125,130,147],"ViT":[68,94,113],"backbone":[69],"is":[70],"required":[71],"method.":[78],"In":[79],"work,":[81],"we":[82,116],"propose":[83],"an":[84,174],"image":[85],"compensator":[87],"combined":[88],"with":[89],"a":[90,118],"backbones":[95],"to":[96,141,202],"accelerate":[97],"detection.":[101],"Unlike":[102],"\\texttt{ToC3D},":[103],"our":[104,160],"approach":[105],"enables":[106],"dynamic":[107],"layer-wise":[108],"within":[111],"backbone.":[114],"Furthermore,":[115],"introduce":[117],"parameter-efficient":[119],"fine-tuning":[120],"strategy,":[121],"which":[122],"trains":[123],"only":[124,142],"proposed":[126,161],"modules,":[127],"thereby":[128],"reducing":[129],"number":[131],"fine-tuned":[133],"parameters":[134],"from":[135],"more":[136],"than":[137],"$300$":[138],"million":[139],"(M)":[140],"$1.6$":[143],"M.":[144],"Experiments":[145],"on":[146],"NuScenes":[149,193],"dataset":[150],"across":[151],"three":[152],"demonstrate":[158],"that":[159],"method":[162],"decreases":[163],"complexity":[165],"(GFLOPs)":[166],"by":[167,177,187,196],"$48\\%$":[168],"...":[169,179,189,198],"$55\\%$,":[170],"inference":[171],"latency":[172],"(on":[173],"\\texttt{NVIDIA-GV100}":[175],"GPU)":[176],"$9\\%$":[178],"$25\\%$,":[180],"while":[181],"still":[182],"improving":[183],"mean":[184],"average":[185],"precision":[186],"$1.0\\%$":[188],"$2.8\\%$":[190],"absolute":[191,200],"score":[195],"$0.4\\%$":[197],"$1.2\\%$":[199],"compared":[201],"so-far":[203],"SOTA":[204],"\\texttt{ToC3D}.":[205]},"counts_by_year":[],"updated_date":"2026-04-17T06:04:52.305304","created_date":"2026-04-17T00:00:00"}
