{"id":"https://openalex.org/W7164858584","doi":"https://doi.org/10.1145/3805622.3810717","title":"Reflective Cross-Granularity Grounding with Preference Optimization for Long Video Understanding","display_name":"Reflective Cross-Granularity Grounding with Preference Optimization for Long Video Understanding","publication_year":2026,"publication_date":"2026-06-15","ids":{"openalex":"https://openalex.org/W7164858584","doi":"https://doi.org/10.1145/3805622.3810717"},"language":null,"primary_location":{"id":"doi:10.1145/3805622.3810717","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810717","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3805622.3810717","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5103259925","display_name":"Wei Feng","orcid":"https://orcid.org/0009-0002-4796-4205"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wei Feng","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0002-4796-4205","affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022927606","display_name":"Xin Wang","orcid":"https://orcid.org/0000-0002-0351-2939"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xin Wang","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-0351-2939","affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100420416","display_name":"Hong Chen","orcid":"https://orcid.org/0000-0002-0943-2286"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hong Chen","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-0943-2286","affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079124282","display_name":"Yu-Wei Zhan","orcid":"https://orcid.org/0000-0002-5822-5646"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yu-Wei Zhan","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-5822-5646","affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101644468","display_name":"Zihan Song","orcid":"https://orcid.org/0009-0009-1485-0536"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zihan Song","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0009-1485-0536","affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101401100","display_name":"Bin Huang","orcid":"https://orcid.org/0009-0000-2504-3689"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bin Huang","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0000-2504-3689","affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087247298","display_name":"Kecheng Zheng","orcid":"https://orcid.org/0000-0002-3450-400X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kecheng Zheng","raw_affiliation_strings":["Ant Research, Hangzhou, China"],"raw_orcid":"https://orcid.org/0000-0002-3450-400X","affiliations":[{"raw_affiliation_string":"Ant Research, Hangzhou, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100339293","display_name":"Wenwu Zhu","orcid":"https://orcid.org/0000-0003-2236-9290"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenwu Zhu","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0003-2236-9290","affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.94018953,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1413","last_page":"1422"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9793999791145325,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9793999791145325,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.002899999963119626,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.002400000113993883,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reflection","display_name":"Reflection (computer programming)","score":0.6620000004768372},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.6478999853134155},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.5593000054359436},{"id":"https://openalex.org/keywords/video-tracking","display_name":"Video tracking","score":0.5289999842643738},{"id":"https://openalex.org/keywords/preference","display_name":"Preference","score":0.5073999762535095},{"id":"https://openalex.org/keywords/video-processing","display_name":"Video processing","score":0.3822999894618988},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.3756999969482422}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7325999736785889},{"id":"https://openalex.org/C65682993","wikidata":"https://www.wikidata.org/wiki/Q1056451","display_name":"Reflection (computer programming)","level":2,"score":0.6620000004768372},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.6478999853134155},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.5593000054359436},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.5289999842643738},{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.5073999762535095},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5022000074386597},{"id":"https://openalex.org/C65483669","wikidata":"https://www.wikidata.org/wiki/Q3536669","display_name":"Video processing","level":2,"score":0.3822999894618988},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3797000050544739},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.3756999969482422},{"id":"https://openalex.org/C168993435","wikidata":"https://www.wikidata.org/wiki/Q6501125","display_name":"Ground","level":2,"score":0.36070001125335693},{"id":"https://openalex.org/C117090137","wikidata":"https://www.wikidata.org/wiki/Q7927977","display_name":"Video post-processing","level":5,"score":0.35929998755455017},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.33570000529289246},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.31679999828338623},{"id":"https://openalex.org/C193081819","wikidata":"https://www.wikidata.org/wiki/Q4132092","display_name":"Video feedback","level":2,"score":0.29019999504089355},{"id":"https://openalex.org/C2988634675","wikidata":"https://www.wikidata.org/wiki/Q34508","display_name":"Video recording","level":2,"score":0.2759999930858612},{"id":"https://openalex.org/C166142869","wikidata":"https://www.wikidata.org/wiki/Q60061622","display_name":"Video production","level":2,"score":0.2689000070095062},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.2581000030040741},{"id":"https://openalex.org/C2780139006","wikidata":"https://www.wikidata.org/wiki/Q1493902","display_name":"Key frame","level":3,"score":0.25529998540878296},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.2540999948978424}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3805622.3810717","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810717","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3805622.3810717","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810717","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.5067673921585083,"display_name":"Quality Education"}],"awards":[{"id":"https://openalex.org/G3665546014","display_name":null,"funder_award_id":"No.62222209","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G937738700","display_name":null,"funder_award_id":"No.BNR2023TD03006","funder_id":"https://openalex.org/F4320329777","funder_display_name":"Beijing National Research Center For Information Science And Technology"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320329777","display_name":"Beijing National Research Center For Information Science And Technology","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":28,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W2963916161","https://openalex.org/W2964089981","https://openalex.org/W2964220823","https://openalex.org/W3175961224","https://openalex.org/W3204588463","https://openalex.org/W3205786327","https://openalex.org/W4389519587","https://openalex.org/W4402702917","https://openalex.org/W4402727142","https://openalex.org/W4402727272","https://openalex.org/W4402754220","https://openalex.org/W4402754238","https://openalex.org/W4402917053","https://openalex.org/W4404781975","https://openalex.org/W4404784276","https://openalex.org/W4409367064","https://openalex.org/W4409369885","https://openalex.org/W4409370140","https://openalex.org/W4413144845","https://openalex.org/W4413145669","https://openalex.org/W4413147303","https://openalex.org/W4413147815","https://openalex.org/W4413156662","https://openalex.org/W4415798209","https://openalex.org/W4415798317","https://openalex.org/W7133182950","https://openalex.org/W7133250524"],"related_works":[],"abstract_inverted_index":{"Video":[0,259],"Large":[1,260],"Language":[2,261],"Models":[3],"(video":[4],"LLMs)":[5],"have":[6],"demonstrated":[7],"remarkable":[8],"capabilities":[9,82],"in":[10,51],"video":[11,16,32,66,75,85,107,115,125,130,138,145,156,172,178,241,246,265],"understanding":[12,23,93],"tasks,":[13],"such":[14],"as":[15],"question":[17,242],"answering":[18,243],"and":[19,47,101,201,244],"temporal":[20,245],"localization.":[21],"However,":[22],"long":[24,38,74,106,129,137,240,264],"videos":[25],"still":[26],"remains":[27],"a":[28,60,118],"significant":[29],"challenge.":[30],"Existing":[31],"LLMs":[33],"adopt":[34],"uni-granularity":[35],"tokens":[36,161],"for":[37,65,105,239,263],"videos,":[39],"failing":[40],"to":[41,71,193,204,206,232],"simultaneously":[42],"understand":[43,166],"both":[44],"high-level":[45,84],"semantics":[46,86],"low-level":[48],"visual":[49],"details":[50],"videos.":[52],"To":[53],"tackle":[54],"this":[55],"problem,":[56],"we":[57,96],"propose":[58,97],"ReCrossVLLM,":[59],"reflective":[61],"cross-granularity":[62,228],"grounding":[63,100,112,209,221,235,247],"framework":[64,254],"LLM":[67,116,146,173],"with":[68,117,139,148,158,211,225],"preference":[69,229],"optimization":[70,230],"collaboratively":[72],"achieve":[73],"understanding,":[76,87],"which":[77],"not":[78],"only":[79],"retains":[80],"the":[81,91,98,110,114,123,128,136,149,154,171,175,181,190,195,198,207,216,219,258],"of":[83,135,197],"but":[88],"also":[89],"strengthens":[90],"fine-grained":[92,150,167,191],"abilities.":[94],"Specifically,":[95],"coarse-to-fine":[99,111,208,220],"fine-to-coarse":[102,186],"reflection":[103,187,212],"strategies":[104],"understanding.":[108,266],"In":[109,169],"strategy,":[113],"coarse-grained":[119],"module":[120,151,192],"first":[121],"locates":[122,174],"key":[124,155,177],"segments":[126,157],"from":[127],"by":[131],"tackling":[132],"massive":[133],"frames":[134],"fewer":[140],"per-frame":[141,160],"tokens.":[142],"And":[143],"then":[144],"adapted":[147],"further":[152,233],"analyzes":[153],"more":[159],"so":[162],"that":[163,250],"it":[164],"can":[165,255],"information.":[168],"case":[170],"wrong":[176],"segments,":[179],"during":[180,215],"inference":[182],"stage,":[183,218],"our":[184,226,251],"designed":[185],"strategy":[188,210,222,231],"instructs":[189],"reflect":[194],"effectiveness":[196],"locating":[199],"result":[200],"decide":[202],"whether":[203],"return":[205],"feedback.":[213],"Additionally,":[214],"training":[217],"is":[223],"optimized":[224],"proposed":[227,252],"improve":[234,257],"efficiency.":[236],"Extensive":[237],"experiments":[238],"tasks":[248],"demonstrate":[249],"ReCrossVLLM":[253],"significantly":[256],"Model":[262]},"counts_by_year":[],"updated_date":"2026-06-16T07:37:23.134862","created_date":"2026-06-16T00:00:00"}
