{"id":"https://openalex.org/W4415539222","doi":"https://doi.org/10.1145/3746027.3755119","title":"Fast3D: Accelerating 3D Multi-modal Large Language Models for Efficient 3D Scene Understanding","display_name":"Fast3D: Accelerating 3D Multi-modal Large Language Models for Efficient 3D Scene Understanding","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415539222","doi":"https://doi.org/10.1145/3746027.3755119"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3755119","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755119","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5072986859","display_name":"Wencan Huang","orcid":"https://orcid.org/0000-0002-1555-3674"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Wencan Huang","raw_affiliation_strings":["Wangxuan Institute of Computer Technology, Peking University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-1555-3674","affiliations":[{"raw_affiliation_string":"Wangxuan Institute of Computer Technology, Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078220957","display_name":"Daizong Liu","orcid":"https://orcid.org/0000-0001-8179-4508"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Daizong Liu","raw_affiliation_strings":["Wangxuan Institute of Computer Technology, Peking University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-8179-4508","affiliations":[{"raw_affiliation_string":"Wangxuan Institute of Computer Technology, Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5059045087","display_name":"Wei Hu","orcid":"https://orcid.org/0000-0002-9860-0922"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wei Hu","raw_affiliation_strings":["Wangxuan Institute of Computer Technology, Peking University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-9860-0922","affiliations":[{"raw_affiliation_string":"Wangxuan Institute of Computer Technology, Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5072986859"],"corresponding_institution_ids":["https://openalex.org/I20231570"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.28812137,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"10935","last_page":"10944"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9919999837875366,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.8424000144004822},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.6780999898910522},{"id":"https://openalex.org/keywords/pruning","display_name":"Pruning","score":0.6322000026702881},{"id":"https://openalex.org/keywords/redundancy","display_name":"Redundancy (engineering)","score":0.6060000061988831},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4514000117778778},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.40130001306533813},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.36809998750686646}],"concepts":[{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.8424000144004822},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8007000088691711},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.6780999898910522},{"id":"https://openalex.org/C108010975","wikidata":"https://www.wikidata.org/wiki/Q500094","display_name":"Pruning","level":2,"score":0.6322000026702881},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.6060000061988831},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.507099986076355},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4514000117778778},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.40470001101493835},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.40130001306533813},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.36809998750686646},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.36730000376701355},{"id":"https://openalex.org/C2984842247","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep neural networks","level":3,"score":0.35440000891685486},{"id":"https://openalex.org/C205711294","wikidata":"https://www.wikidata.org/wiki/Q176953","display_name":"Rendering (computer graphics)","level":2,"score":0.32760000228881836},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.31450000405311584},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.3091000020503998},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2849999964237213},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.2818000018596649}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3755119","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755119","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":10,"referenced_works":["https://openalex.org/W1956340063","https://openalex.org/W3107521863","https://openalex.org/W3204647170","https://openalex.org/W4312872526","https://openalex.org/W4390873017","https://openalex.org/W4395481535","https://openalex.org/W4402716272","https://openalex.org/W4402754134","https://openalex.org/W4403791765","https://openalex.org/W4403939369"],"related_works":[],"abstract_inverted_index":{"While":[0],"3D":[1,35,52,78,101,117],"Multi-modal":[2],"Large":[3],"Language":[4],"Models":[5],"(MLLMs)":[6],"demonstrate":[7],"remarkable":[8],"scene":[9,36],"understanding":[10],"capabilities,":[11],"their":[12],"practical":[13],"deployment":[14],"faces":[15],"critical":[16,70],"challenges":[17],"due":[18,57],"to":[19,51,58,82,134],"computational":[20],"inefficiency.":[21],"The":[22],"key":[23],"bottleneck":[24],"stems":[25],"from":[26],"processing":[27],"excessive":[28],"object-centric":[29],"visual":[30,39,112,155,204],"tokens":[31,99],"required":[32],"for":[33,96,116,149],"comprehensive":[34],"representation.":[37],"Although":[38],"token":[40,62,79,113,146,156,162,205],"pruning":[41,114,151,171,206],"has":[42],"shown":[43],"promise":[44],"in":[45,61,76,85,100],"accelerating":[46],"2D":[47,86],"MLLMs,":[48],"its":[49],"applicability":[50],"domains":[53],"remains":[54],"largely":[55],"unexplored":[56],"fundamental":[59],"disparities":[60],"structures.":[63],"In":[64],"this":[65],"paper,":[66],"we":[67,107],"reveal":[68],"two":[69,120,180],"insights:":[71],"(1)":[72,123],"Significant":[73],"redundancy":[74,84],"exists":[75],"object-level":[77],"representations,":[80],"analogous":[81],"patch-level":[83],"systems;":[87],"(2)":[88,153],"Global":[89,124],"attention":[90,138],"patterns":[91],"exhibit":[92],"strong":[93],"predictive":[94],"power":[95],"identifying":[97],"non-essential":[98],"contexts.":[102],"Building":[103],"on":[104,174],"these":[105,179],"observations,":[106],"propose":[108],"Fast3D,":[109,200],"a":[110,129],"plug-and-play":[111],"framework":[115],"MLLMs":[118],"featuring":[119],"technical":[121],"innovations:":[122],"Attention":[125],"Prediction":[126],"(GAP),":[127],"where":[128],"lightweight":[130],"neural":[131],"network":[132],"learns":[133],"predict":[135],"the":[136,141,185,188,197],"global":[137],"distributions":[139],"of":[140,178,187,199],"target":[142,189],"model,":[143],"enabling":[144],"efficient":[145],"importance":[147],"estimation":[148],"precise":[150],"guidance;":[152],"Sample-Adaptive":[154],"Pruning":[157],"(SAP),":[158],"which":[159],"introduces":[160],"dynamic":[161],"budgets":[163],"through":[164],"attention-based":[165],"complexity":[166],"assessment,":[167],"automatically":[168],"adjusting":[169],"layer-wise":[170],"ratios":[172],"based":[173],"input":[175],"characteristics.":[176],"Both":[177],"techniques":[181],"operate":[182],"without":[183],"modifying":[184],"parameters":[186],"model.":[190],"Extensive":[191],"evaluations":[192],"across":[193],"five":[194],"benchmarks":[195],"validate":[196],"effectiveness":[198],"particularly":[201],"under":[202],"high":[203],"ratios.":[207],"Code":[208],"is":[209],"available":[210],"at":[211],"https://github.com/wencan25/Fast3D.":[212]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-25T00:00:00"}
