{"id":"https://openalex.org/W4414360705","doi":"https://doi.org/10.24963/ijcai.2025/98","title":"Diff-LMM: Diffusion Teacher-Guided Spatio-Temporal Perception for Video Large Multimodal Models","display_name":"Diff-LMM: Diffusion Teacher-Guided Spatio-Temporal Perception for Video Large Multimodal Models","publication_year":2025,"publication_date":"2025-09-01","ids":{"openalex":"https://openalex.org/W4414360705","doi":"https://doi.org/10.24963/ijcai.2025/98"},"language":"en","primary_location":{"id":"doi:10.24963/ijcai.2025/98","is_oa":false,"landing_page_url":"https://doi.org/10.24963/ijcai.2025/98","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Thirty-Fourth International Joint Conference on Artificial Intelligence","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5058458910","display_name":"Jisheng Dang","orcid":"https://orcid.org/0000-0002-5378-6225"},"institutions":[{"id":"https://openalex.org/I76214153","display_name":"Lanzhou University","ror":"https://ror.org/01mkqqe32","country_code":"CN","type":"education","lineage":["https://openalex.org/I76214153"]},{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]},{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN","SG"],"is_corresponding":true,"raw_author_name":"Jisheng Dang","raw_affiliation_strings":["Lanzhou University, Gansu, China","National University of Singapore, Singapore","Sun Yat-sen University, Guangdong, China"],"affiliations":[{"raw_affiliation_string":"Lanzhou University, Gansu, China","institution_ids":["https://openalex.org/I76214153"]},{"raw_affiliation_string":"National University of Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]},{"raw_affiliation_string":"Sun Yat-sen University, Guangdong, China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021963647","display_name":"Ligen Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ligen Chen","raw_affiliation_strings":["Sun Yat-sen University, Guangdong, China"],"affiliations":[{"raw_affiliation_string":"Sun Yat-sen University, Guangdong, China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019956300","display_name":"Jun Wu","orcid":"https://orcid.org/0000-0002-2231-5607"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jingze Wu","raw_affiliation_strings":["Sun Yat-sen University, Guangdong, China"],"affiliations":[{"raw_affiliation_string":"Sun Yat-sen University, Guangdong, China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032333673","display_name":"Ronghao Lin","orcid":"https://orcid.org/0000-0003-4530-4529"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ronghao Lin","raw_affiliation_strings":["Sun Yat-sen University, Guangdong, China"],"affiliations":[{"raw_affiliation_string":"Sun Yat-sen University, Guangdong, China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101160189","display_name":"Bimei Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]},{"id":"https://openalex.org/I159948400","display_name":"Jinan University","ror":"https://ror.org/02xe5ns62","country_code":"CN","type":"education","lineage":["https://openalex.org/I159948400"]}],"countries":["CN","SG"],"is_corresponding":false,"raw_author_name":"Bimei Wang","raw_affiliation_strings":["Jinan University, Guangdong, China","National University of Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"Jinan University, Guangdong, China","institution_ids":["https://openalex.org/I159948400"]},{"raw_affiliation_string":"National University of Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100377603","display_name":"Yun Wang","orcid":"https://orcid.org/0000-0001-8384-6981"},"institutions":[{"id":"https://openalex.org/I168719708","display_name":"City University of Hong Kong","ror":"https://ror.org/03q8dnn23","country_code":"HK","type":"education","lineage":["https://openalex.org/I168719708"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Yun Wang","raw_affiliation_strings":["City University of Hong Kong, China"],"affiliations":[{"raw_affiliation_string":"City University of Hong Kong, China","institution_ids":["https://openalex.org/I168719708"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100652853","display_name":"Liting Wang","orcid":"https://orcid.org/0000-0002-1215-9495"},"institutions":[{"id":"https://openalex.org/I68986083","display_name":"Northwest Normal University","ror":"https://ror.org/00gx3j908","country_code":"CN","type":"education","lineage":["https://openalex.org/I68986083"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Liting Wang","raw_affiliation_strings":["Northwest Normal University, Gansu, China"],"affiliations":[{"raw_affiliation_string":"Northwest Normal University, Gansu, China","institution_ids":["https://openalex.org/I68986083"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101581501","display_name":"Nannan Zhu","orcid":"https://orcid.org/0000-0003-4038-3053"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Nannan Zhu","raw_affiliation_strings":["Sun Yat-sen University, Guangdong, China"],"affiliations":[{"raw_affiliation_string":"Sun Yat-sen University, Guangdong, China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5047351419","display_name":"Teng Wang","orcid":"https://orcid.org/0000-0003-2331-3619"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Teng Wang","raw_affiliation_strings":["University of Hong Kong, China"],"affiliations":[{"raw_affiliation_string":"University of Hong Kong, China","institution_ids":["https://openalex.org/I889458895"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5058458910"],"corresponding_institution_ids":["https://openalex.org/I157773358","https://openalex.org/I165932596","https://openalex.org/I76214153"],"apc_list":null,"apc_paid":null,"fwci":2.7712,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.91932609,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"873","last_page":"881"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9948999881744385,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9948999881744385,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.98580002784729,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9825000166893005,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/hallucinating","display_name":"Hallucinating","score":0.8804000020027161},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.7092999815940857},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.6189000010490417},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5715000033378601},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.5253999829292297},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.48350000381469727},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.3601999878883362},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.35850000381469727}],"concepts":[{"id":"https://openalex.org/C2911011789","wikidata":"https://www.wikidata.org/wiki/Q130741","display_name":"Hallucinating","level":2,"score":0.8804000020027161},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7900999784469604},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.7092999815940857},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6583999991416931},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.6189000010490417},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5715000033378601},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.5253999829292297},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.48350000381469727},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.439300000667572},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3970000147819519},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3601999878883362},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.35850000381469727},{"id":"https://openalex.org/C69357855","wikidata":"https://www.wikidata.org/wiki/Q163214","display_name":"Diffusion","level":2,"score":0.35580000281333923},{"id":"https://openalex.org/C101814296","wikidata":"https://www.wikidata.org/wiki/Q5439685","display_name":"Feature model","level":3,"score":0.3262999951839447},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.32269999384880066},{"id":"https://openalex.org/C2781122975","wikidata":"https://www.wikidata.org/wiki/Q16928266","display_name":"Semantic feature","level":2,"score":0.2896000146865845},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.27959999442100525},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.26969999074935913},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.2696000039577484},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2651999890804291},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.2648000121116638},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.2612999975681305}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.24963/ijcai.2025/98","is_oa":false,"landing_page_url":"https://doi.org/10.24963/ijcai.2025/98","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Thirty-Fourth International Joint Conference on Artificial Intelligence","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Dynamic":[0],"spatio-temporal":[1],"understanding":[2,36,128],"is":[3],"essential":[4],"for":[5,139],"video-based":[6],"multimodal":[7,67,141],"tasks,":[8],"yet":[9],"existing":[10],"methods":[11],"often":[12],"struggle":[13],"to":[14,44,93,106],"capture":[15,94],"fine-grained":[16,49],"temporal":[17],"and":[18,84],"spatial":[19,95],"relationships":[20],"in":[21,34,144],"long":[22],"videos.":[23],"Current":[24],"approaches":[25],"primarily":[26],"rely":[27],"on":[28,117],"pre-trained":[29,112],"CLIP":[30],"encoders,":[31],"which":[32],"excel":[33],"semantic":[35],"but":[37],"lack":[38],"spatially-aware":[39],"visual":[40,79],"context.":[41],"This":[42],"leads":[43],"hallucinated":[45],"results":[46,131],"when":[47],"interpreting":[48],"objects":[50],"or":[51],"scenes.":[52],"To":[53],"address":[54],"these":[55],"limitations,":[56],"we":[57,77,100],"propose":[58],"a":[59,102,136],"novel":[60],"framework":[61],"that":[62],"integrates":[63],"diffusion":[64,72,113,133],"models":[65,134,143],"into":[66],"video":[68,127,142],"models.":[69,114],"By":[70],"employing":[71],"encoders":[73],"at":[74],"intermediate":[75],"layers,":[76],"enhance":[78],"representations":[80],"through":[81],"feature":[82,109],"alignment":[83,104],"knowledge":[85],"distillation":[86],"losses,":[87],"significantly":[88],"improving":[89],"the":[90],"model's":[91],"ability":[92],"patterns":[96],"over":[97],"time.":[98],"Additionally,":[99],"introduce":[101],"multi-level":[103],"strategy":[105],"learn":[107],"robust":[108],"correspondence":[110],"from":[111],"Extensive":[115],"experiments":[116],"benchmark":[118],"datasets":[119],"demonstrate":[120],"our":[121],"approach's":[122],"state-of-the-art":[123],"performance":[124],"across":[125],"multiple":[126],"tasks.":[129],"These":[130],"establish":[132],"as":[135],"powerful":[137],"tool":[138],"enhancing":[140],"complex,":[145],"dynamic":[146],"scenarios.":[147]},"counts_by_year":[{"year":2026,"cited_by_count":2}],"updated_date":"2026-03-12T08:34:05.389933","created_date":"2025-10-10T00:00:00"}
