{"id":"https://openalex.org/W4415537847","doi":"https://doi.org/10.1145/3746027.3762058","title":"MGVC: MLLM-Guided Video Captioning for the IntentVC Challenge","display_name":"MGVC: MLLM-Guided Video Captioning for the IntentVC Challenge","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415537847","doi":"https://doi.org/10.1145/3746027.3762058"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3762058","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746027.3762058","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3746027.3762058","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5022345139","display_name":"Zhipeng Yu","orcid":"https://orcid.org/0000-0003-1132-859X"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhipeng Yu","raw_affiliation_strings":["SEECE, UCAS, Beijing, China"],"affiliations":[{"raw_affiliation_string":"SEECE, UCAS, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089092566","display_name":"Qianqian Xu","orcid":"https://orcid.org/0000-0002-3512-7277"},"institutions":[{"id":"https://openalex.org/I4210101410","display_name":"International Centre for Theoretical Physics Asia-Pacific","ror":"https://ror.org/01z2px678","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210101410","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qianqian Xu","raw_affiliation_strings":["IIP, ICT, CAS, Beijing, China"],"affiliations":[{"raw_affiliation_string":"IIP, ICT, CAS, Beijing, China","institution_ids":["https://openalex.org/I4210101410"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053213796","display_name":"Yangbangyan Jiang","orcid":"https://orcid.org/0000-0002-0148-8306"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yangbangyan Jiang","raw_affiliation_strings":["SCST, UCAS, Beijing, China"],"affiliations":[{"raw_affiliation_string":"SCST, UCAS, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113528935","display_name":"Pengming Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pinci Yang","raw_affiliation_strings":["SEECE, UCAS, Beijing, China"],"affiliations":[{"raw_affiliation_string":"SEECE, UCAS, Beijing, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5028597017","display_name":"Qingming Huang","orcid":"https://orcid.org/0000-0001-7542-296X"},"institutions":[{"id":"https://openalex.org/I4210101410","display_name":"International Centre for Theoretical Physics Asia-Pacific","ror":"https://ror.org/01z2px678","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210101410","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qingming Huang","raw_affiliation_strings":["SCST, UCAS IIP, ICT, CAS, Beijing, China"],"affiliations":[{"raw_affiliation_string":"SCST, UCAS IIP, ICT, CAS, Beijing, China","institution_ids":["https://openalex.org/I4210101410"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5022345139"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.29675709,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"13815","last_page":"13821"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.994700014591217,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9922000169754028,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.9866999983787537},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.6728000044822693},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.6402999758720398},{"id":"https://openalex.org/keywords/filter","display_name":"Filter (signal processing)","score":0.5809999704360962},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.531000018119812},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.5138000249862671},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.45730000734329224}],"concepts":[{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.9866999983787537},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8871999979019165},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.6728000044822693},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.6402999758720398},{"id":"https://openalex.org/C106131492","wikidata":"https://www.wikidata.org/wiki/Q3072260","display_name":"Filter (signal processing)","level":2,"score":0.5809999704360962},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5410000085830688},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5385000109672546},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.531000018119812},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.5138000249862671},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.45730000734329224},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3919999897480011},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.3743000030517578},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.34220001101493835},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3172999918460846},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3163999915122986},{"id":"https://openalex.org/C2778493491","wikidata":"https://www.wikidata.org/wiki/Q7449072","display_name":"Semantic matching","level":3,"score":0.30630001425743103},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2989000082015991},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.2800000011920929},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2793999910354614},{"id":"https://openalex.org/C2775955345","wikidata":"https://www.wikidata.org/wiki/Q7449071","display_name":"Semantic mapping","level":2,"score":0.2720000147819519},{"id":"https://openalex.org/C95713431","wikidata":"https://www.wikidata.org/wiki/Q631425","display_name":"Vulnerability (computing)","level":2,"score":0.25780001282691956},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.2563999891281128}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3762058","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746027.3762058","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3746027.3762058","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746027.3762058","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G4734482802","display_name":null,"funder_award_id":"2023M743441","funder_id":"https://openalex.org/F4320321543","funder_display_name":"China Postdoctoral Science Foundation"},{"id":"https://openalex.org/G7452466908","display_name":null,"funder_award_id":"62236008,U21B2038,U23B2051,62471013,62406305,62476068","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320321543","display_name":"China Postdoctoral Science Foundation","ror":"https://ror.org/0426zh255"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":5,"referenced_works":["https://openalex.org/W2425121537","https://openalex.org/W4312563428","https://openalex.org/W4312956471","https://openalex.org/W4402727764","https://openalex.org/W4413145408"],"related_works":[],"abstract_inverted_index":{"Recently,":[0],"with":[1,141],"the":[2,37,61,103,142,159,166],"rapid":[3],"advancement":[4],"of":[5,170],"multimodal":[6],"large":[7],"language":[8],"models":[9],"(MLLMs),":[10],"intent-oriented":[11],"video":[12,30,58],"captioning":[13,31,59],"has":[14],"received":[15],"increasing":[16],"attention":[17],"due":[18,35],"to":[19,36,72],"its":[20],"potential":[21],"for":[22,39,55,87],"controllable":[23,57],"and":[24,43,91,135,168],"grounded":[25],"visual":[26],"understanding.":[27],"Fine-grained":[28],"localized":[29],"presents":[32],"unique":[33],"challenges":[34],"need":[38],"controllability,":[40],"object":[41],"grounding,":[42],"temporal":[44],"precision.":[45],"In":[46],"this":[47],"paper,":[48],"we":[49,118],"propose":[50],"MGVC,":[51],"a":[52,69,96],"two-stage":[53],"framework":[54],"intention-oriented":[56],"in":[60,158],"IntentVC":[62,104,160],"2025":[63,161],"Challenge.":[64],"Our":[65],"pipeline":[66],"first":[67],"leverages":[68],"fine-tuned":[70],"MLLM":[71,86],"generate":[73],"diverse":[74],"preliminary":[75],"captions.":[76,114],"These":[77],"candidate":[78,113],"captions":[79],"are":[80],"then":[81],"refined":[82],"by":[83],"another":[84],"finetuned":[85,101],"further":[88,100],"semantic":[89],"alignment":[90,145],"stylistic":[92],"coherence.":[93],"We":[94],"introduce":[95],"video-text":[97],"matching":[98],"module,":[99],"on":[102,128],"dataset.":[105],"This":[106],"module":[107],"will":[108],"filter":[109],"out":[110],"semantically":[111],"misaligned":[112],"For":[115],"caption":[116,124,140],"selection,":[117],"train":[119],"category-specific":[120],"regressors":[121],"that":[122],"predict":[123],"quality":[125],"scores":[126],"based":[127],"VTM":[129],"similarity,":[130],"textual":[131],"features,":[132],"intra-caption":[133],"BLEU,":[134],"CLIP-based":[136],"retrieval":[137],"correlations.":[138],"The":[139],"highest":[143],"predicted":[144],"score":[146],"is":[147],"chosen":[148],"as":[149],"final":[150],"output.":[151],"Finally,":[152],"our":[153,171],"method":[154],"achieves":[155],"1st":[156],"place":[157],"Grand":[162],"Challenge,":[163],"which":[164],"demonstrates":[165],"effectiveness":[167],"generalization":[169],"proposed":[172],"method.":[173]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-25T00:00:00"}
