{"id":"https://openalex.org/W4414892879","doi":"https://doi.org/10.1109/tip.2026.3680029","title":"SynPO: Synergizing Descriptiveness and Preference Optimization for Video Detailed Captioning","display_name":"SynPO: Synergizing Descriptiveness and Preference Optimization for Video Detailed Captioning","publication_year":2026,"publication_date":"2026-01-01","ids":{"openalex":"https://openalex.org/W4414892879","doi":"https://doi.org/10.1109/tip.2026.3680029","pmid":"https://pubmed.ncbi.nlm.nih.gov/41945808"},"language":"en","primary_location":{"id":"doi:10.1109/tip.2026.3680029","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tip.2026.3680029","pdf_url":null,"source":{"id":"https://openalex.org/S4210173141","display_name":"IEEE Transactions on Image Processing","issn_l":"1057-7149","issn":["1057-7149","1941-0042"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Image Processing","raw_type":"journal-article"},"type":"article","indexed_in":["arxiv","crossref","datacite","pubmed"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2506.00835","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5058458910","display_name":"Jisheng Dang","orcid":"https://orcid.org/0000-0002-5378-6225"},"institutions":[{"id":"https://openalex.org/I76214153","display_name":"Lanzhou University","ror":"https://ror.org/01mkqqe32","country_code":"CN","type":"education","lineage":["https://openalex.org/I76214153"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jisheng Dang","raw_affiliation_strings":["School of Information Science and Engineering, Lanzhou University, Lanzhou, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Information Science and Engineering, Lanzhou University, Lanzhou, China","institution_ids":["https://openalex.org/I76214153"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103120964","display_name":"Yizhou Zhang","orcid":"https://orcid.org/0000-0002-6349-7937"},"institutions":[{"id":"https://openalex.org/I76214153","display_name":"Lanzhou University","ror":"https://ror.org/01mkqqe32","country_code":"CN","type":"education","lineage":["https://openalex.org/I76214153"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yizhou Zhang","raw_affiliation_strings":["School of Information Science and Engineering, Lanzhou University, Lanzhou, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Information Science and Engineering, Lanzhou University, Lanzhou, China","institution_ids":["https://openalex.org/I76214153"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Hao Ye","orcid":"https://orcid.org/0009-0001-5336-6810"},"institutions":[{"id":"https://openalex.org/I76214153","display_name":"Lanzhou University","ror":"https://ror.org/01mkqqe32","country_code":"CN","type":"education","lineage":["https://openalex.org/I76214153"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hao Ye","raw_affiliation_strings":["School of Information Science and Engineering, Lanzhou University, Lanzhou, China"],"raw_orcid":"https://orcid.org/0009-0001-5336-6810","affiliations":[{"raw_affiliation_string":"School of Information Science and Engineering, Lanzhou University, Lanzhou, China","institution_ids":["https://openalex.org/I76214153"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047351419","display_name":"Teng Wang","orcid":"https://orcid.org/0000-0003-2331-3619"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Teng Wang","raw_affiliation_strings":["University of Hong Kong, Hong Kong, China"],"raw_orcid":"https://orcid.org/0000-0003-2331-3619","affiliations":[{"raw_affiliation_string":"University of Hong Kong, Hong Kong, China","institution_ids":["https://openalex.org/I889458895"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yulan Guo","orcid":"https://orcid.org/0000-0003-0952-476X"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yulan Guo","raw_affiliation_strings":["School of Electronics and Communication Engineering, Sun Yat-sen University, Shenzhen Campus, Shenzhen, China"],"raw_orcid":"https://orcid.org/0000-0003-0952-476X","affiliations":[{"raw_affiliation_string":"School of Electronics and Communication Engineering, Sun Yat-sen University, Shenzhen Campus, Shenzhen, China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"last","author":{"id":null,"display_name":"Bin Hu","orcid":"https://orcid.org/0000-0003-3514-5413"},"institutions":[{"id":"https://openalex.org/I125839683","display_name":"Beijing Institute of Technology","ror":"https://ror.org/01skt4w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I125839683","https://openalex.org/I890469752"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bin Hu","raw_affiliation_strings":["School of Medical Technology, Beijing Institute of Technology, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0003-3514-5413","affiliations":[{"raw_affiliation_string":"School of Medical Technology, Beijing Institute of Technology, Beijing, China","institution_ids":["https://openalex.org/I125839683"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5058458910"],"corresponding_institution_ids":["https://openalex.org/I76214153"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0037849,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"35","issue":null,"first_page":"3780","last_page":"3792"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9941999912261963,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13310","display_name":"Subtitles and Audiovisual Media","score":0.9876999855041504,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.9120000004768372},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.7192000150680542},{"id":"https://openalex.org/keywords/preference","display_name":"Preference","score":0.6769999861717224},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4909999966621399},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.47209998965263367},{"id":"https://openalex.org/keywords/preference-learning","display_name":"Preference learning","score":0.4562999904155731},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.351500004529953}],"concepts":[{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.9120000004768372},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7965999841690063},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.7192000150680542},{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.6769999861717224},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6107000112533569},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5198000073432922},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4909999966621399},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.47209998965263367},{"id":"https://openalex.org/C181204326","wikidata":"https://www.wikidata.org/wiki/Q7239820","display_name":"Preference learning","level":3,"score":0.4562999904155731},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.351500004529953},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.34360000491142273},{"id":"https://openalex.org/C137836250","wikidata":"https://www.wikidata.org/wiki/Q984063","display_name":"Optimization problem","level":2,"score":0.3199000060558319},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.3125},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.2818000018596649},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.27950000762939453},{"id":"https://openalex.org/C2983448237","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Language understanding","level":2,"score":0.2639999985694885},{"id":"https://openalex.org/C2987595161","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Optimization algorithm","level":2,"score":0.2551000118255615},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2549000084400177}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.1109/tip.2026.3680029","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tip.2026.3680029","pdf_url":null,"source":{"id":"https://openalex.org/S4210173141","display_name":"IEEE Transactions on Image Processing","issn_l":"1057-7149","issn":["1057-7149","1941-0042"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Image Processing","raw_type":"journal-article"},{"id":"pmid:41945808","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/41945808","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE transactions on image processing : a publication of the IEEE Signal Processing Society","raw_type":null},{"id":"pmh:oai:arXiv.org:2506.00835","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2506.00835","pdf_url":"https://arxiv.org/pdf/2506.00835","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2506.00835","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2506.00835","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2506.00835","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2506.00835","pdf_url":"https://arxiv.org/pdf/2506.00835","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Fine-grained":[0],"video":[1,11,20,43,167],"captioning":[2,168],"aims":[3],"to":[4,17,33,50,120],"generate":[5],"detailed,":[6],"temporally":[7],"coherent":[8],"descriptions":[9],"of":[10,37,69,123],"content.":[12],"However,":[13],"existing":[14],"methods":[15],"struggle":[16],"capture":[18],"subtle":[19],"dynamics":[21],"and":[22,84,103,131,156,170],"rich":[23],"detailed":[24],"information.":[25],"In":[26],"this":[27],"paper,":[28],"we":[29,56,88],"leverage":[30],"preference":[31,62],"learning":[32],"enhance":[34],"the":[35,66,112,116,124,137,140],"performance":[36,164],"vision-language":[38],"models":[39],"(VLM)":[40],"in":[41,165],"fine-grained":[42,166],"captioning,":[44],"while":[45],"mitigating":[46],"several":[47],"limitations":[48],"inherent":[49],"Direct":[51],"Preference":[52,91],"Optimization":[53,92],"(DPO).":[54],"First,":[55],"propose":[57,89],"a":[58,94],"pipeline":[59,150],"for":[60,139],"constructing":[61],"pairs":[63],"that":[64,160],"leverages":[65],"intrinsic":[67],"properties":[68],"VLMs":[70],"along":[71],"with":[72],"partial":[73],"assistance":[74],"from":[75,110],"large":[76],"language":[77,118],"models,":[78],"achieving":[79],"an":[80],"balance":[81],"between":[82],"cost":[83],"data":[85,148],"quality.":[86],"Then,":[87],"Synergistic":[90],"(SynPO),":[93],"novel":[95],"optimization":[96,125],"method":[97,162],"offering":[98],"significant":[99],"advantages":[100],"over":[101],"DPO":[102],"its":[104],"variants.":[105],"SynPO":[106],"prevents":[107],"negative":[108],"pReferences":[109],"dominating":[111],"training,":[113],"explicitly":[114],"preserves":[115],"model's":[117],"capability":[119],"avoid":[121],"deviation":[122],"objective,":[126],"thus":[127],"obtains":[128],"high-quality":[129],"captions":[130],"improves":[132],"training":[133],"efficiency":[134],"by":[135],"eliminating":[136],"need":[138],"reference":[141],"model.":[142],"We":[143],"extensively":[144],"evaluate":[145],"our":[146,161],"proposed":[147],"construction":[149],"across":[151],"three":[152],"models:":[153],"AuroraCap,":[154],"LLaVA1.6-7B-Video":[155],"InterVL2-8B.":[157],"Results":[158],"demonstrate":[159],"improve":[163],"significantly":[169],"consistenly.":[171],"Source":[172],"code":[173],"is":[174],"available":[175],"at":[176],"https://github.com/longmalongma/SynPO.":[177]},"counts_by_year":[],"updated_date":"2026-04-25T08:17:42.794288","created_date":"2025-10-10T00:00:00"}
