{"id":"https://openalex.org/W4411635418","doi":"https://doi.org/10.1145/3731715.3733389","title":"MirrorDiff: Learning Mirror Diffusion for Image Captioning via Regeneration","display_name":"MirrorDiff: Learning Mirror Diffusion for Image Captioning via Regeneration","publication_year":2025,"publication_date":"2025-06-25","ids":{"openalex":"https://openalex.org/W4411635418","doi":"https://doi.org/10.1145/3731715.3733389"},"language":"en","primary_location":{"id":"doi:10.1145/3731715.3733389","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3731715.3733389","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5112578806","display_name":"Junbo Wang","orcid":"https://orcid.org/0009-0006-9955-7838"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Junbo Wang","raw_affiliation_strings":["School of Software, Northwestern Polytechnical University, Xi'an, China"],"raw_orcid":"https://orcid.org/0009-0006-9955-7838","affiliations":[{"raw_affiliation_string":"School of Software, Northwestern Polytechnical University, Xi'an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019642277","display_name":"Liangyu Fu","orcid":"https://orcid.org/0009-0006-6433-7528"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Liangyu Fu","raw_affiliation_strings":["School of Software, Northwestern Polytechnical University, Xi'an, China"],"raw_orcid":"https://orcid.org/0009-0006-6433-7528","affiliations":[{"raw_affiliation_string":"School of Software, Northwestern Polytechnical University, Xi'an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5099093361","display_name":"Yining Zhu","orcid":"https://orcid.org/0009-0006-6003-7739"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yining Zhu","raw_affiliation_strings":["School of Computer Science, Northwestern Polytechnical University, Xi'an, China"],"raw_orcid":"https://orcid.org/0009-0006-6003-7739","affiliations":[{"raw_affiliation_string":"School of Computer Science, Northwestern Polytechnical University, Xi'an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084965704","display_name":"Qiangguo Jin","orcid":"https://orcid.org/0000-0002-1781-1067"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qiangguo Jin","raw_affiliation_strings":["School of Software, Northwestern Polytechnical University, Xi'an, China"],"raw_orcid":"https://orcid.org/0000-0002-1781-1067","affiliations":[{"raw_affiliation_string":"School of Software, Northwestern Polytechnical University, Xi'an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014269015","display_name":"Hongsong Wang","orcid":"https://orcid.org/0000-0002-9464-1778"},"institutions":[{"id":"https://openalex.org/I76569877","display_name":"Southeast University","ror":"https://ror.org/04ct4d772","country_code":"CN","type":"education","lineage":["https://openalex.org/I76569877"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hongsong Wang","raw_affiliation_strings":["School of Computer Science and Engineering, Southeast University, Nanjing, China"],"raw_orcid":"https://orcid.org/0000-0002-9464-1778","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Southeast University, Nanjing, China","institution_ids":["https://openalex.org/I76569877"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100697205","display_name":"Yuke Li","orcid":"https://orcid.org/0000-0001-9836-4600"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuke Li","raw_affiliation_strings":["School of Software, Northwestern Polytechnical University, Xi'an, China"],"raw_orcid":"https://orcid.org/0000-0001-9836-4600","affiliations":[{"raw_affiliation_string":"School of Software, Northwestern Polytechnical University, Xi'an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108497986","display_name":"Xuecheng Wu","orcid":"https://orcid.org/0000-0002-6244-0269"},"institutions":[{"id":"https://openalex.org/I87445476","display_name":"Xi'an Jiaotong University","ror":"https://ror.org/017zhmm22","country_code":"CN","type":"education","lineage":["https://openalex.org/I87445476"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xuecheng Wu","raw_affiliation_strings":["School of Computer Science and Technology, Xi'an Jiaotong University, Xi'an, China"],"raw_orcid":"https://orcid.org/0000-0002-6244-0269","affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, Xi'an Jiaotong University, Xi'an, China","institution_ids":["https://openalex.org/I87445476"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100668926","display_name":"Kun Hu","orcid":"https://orcid.org/0000-0002-6891-8059"},"institutions":[{"id":"https://openalex.org/I12079687","display_name":"Edith Cowan University","ror":"https://ror.org/05jhnwe22","country_code":"AU","type":"education","lineage":["https://openalex.org/I12079687"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Kun Hu","raw_affiliation_strings":["School of Science, Edith Cowan University, Perth, Australia"],"raw_orcid":"https://orcid.org/0000-0002-6891-8059","affiliations":[{"raw_affiliation_string":"School of Science, Edith Cowan University, Perth, Australia","institution_ids":["https://openalex.org/I12079687"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.10968615,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1331","last_page":"1339"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.996399998664856,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9930999875068665,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.850202202796936},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6583872437477112},{"id":"https://openalex.org/keywords/regeneration","display_name":"Regeneration (biology)","score":0.6448370814323425},{"id":"https://openalex.org/keywords/diffusion","display_name":"Diffusion","score":0.6132593750953674},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.5626421570777893},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.49532976746559143},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.41397911310195923},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.12255141139030457},{"id":"https://openalex.org/keywords/biology","display_name":"Biology","score":0.05670434236526489},{"id":"https://openalex.org/keywords/cell-biology","display_name":"Cell biology","score":0.04447615146636963}],"concepts":[{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.850202202796936},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6583872437477112},{"id":"https://openalex.org/C171056886","wikidata":"https://www.wikidata.org/wiki/Q193119","display_name":"Regeneration (biology)","level":2,"score":0.6448370814323425},{"id":"https://openalex.org/C69357855","wikidata":"https://www.wikidata.org/wiki/Q163214","display_name":"Diffusion","level":2,"score":0.6132593750953674},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.5626421570777893},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.49532976746559143},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.41397911310195923},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.12255141139030457},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.05670434236526489},{"id":"https://openalex.org/C95444343","wikidata":"https://www.wikidata.org/wiki/Q7141","display_name":"Cell biology","level":1,"score":0.04447615146636963},{"id":"https://openalex.org/C97355855","wikidata":"https://www.wikidata.org/wiki/Q11473","display_name":"Thermodynamics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3731715.3733389","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3731715.3733389","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Climate action","score":0.6399999856948853,"id":"https://metadata.un.org/sdg/13"}],"awards":[{"id":"https://openalex.org/G3878614066","display_name":null,"funder_award_id":"D5000250044, D5000250060","funder_id":"https://openalex.org/F4320323817","funder_display_name":"Universitas Brawijaya"},{"id":"https://openalex.org/G909162720","display_name":null,"funder_award_id":"62201460, 62302093","funder_id":"https://openalex.org/F4320323817","funder_display_name":"Universitas Brawijaya"}],"funders":[{"id":"https://openalex.org/F4320323817","display_name":"Universitas Brawijaya","ror":"https://ror.org/01wk3d929"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":30,"referenced_works":["https://openalex.org/W1593271688","https://openalex.org/W1861492603","https://openalex.org/W1895577753","https://openalex.org/W1905882502","https://openalex.org/W1947481528","https://openalex.org/W1956340063","https://openalex.org/W2302086703","https://openalex.org/W2552161745","https://openalex.org/W2600463316","https://openalex.org/W2745461083","https://openalex.org/W2750779823","https://openalex.org/W2890531016","https://openalex.org/W2963101956","https://openalex.org/W2963175879","https://openalex.org/W2979747405","https://openalex.org/W3034655362","https://openalex.org/W3035284526","https://openalex.org/W3106925514","https://openalex.org/W3173220247","https://openalex.org/W3175095612","https://openalex.org/W3196122027","https://openalex.org/W4281690218","https://openalex.org/W4282968790","https://openalex.org/W4304098906","https://openalex.org/W4312933868","https://openalex.org/W4313131769","https://openalex.org/W4386072307","https://openalex.org/W4403757510","https://openalex.org/W6600234944","https://openalex.org/W6838815585"],"related_works":["https://openalex.org/W4210416330","https://openalex.org/W2775506363","https://openalex.org/W3164229987","https://openalex.org/W3215212336","https://openalex.org/W4290852288","https://openalex.org/W3217388757","https://openalex.org/W3122720459","https://openalex.org/W4298897568","https://openalex.org/W1938708284","https://openalex.org/W4380190185"],"abstract_inverted_index":{"Recently,":[0],"diffusion":[1,122],"models":[2],"which":[3,71,174],"have":[4,12],"achieved":[5],"promising":[6],"progress":[7],"in":[8,49,156],"text-to-image":[9],"generation":[10,69],"generally":[11,15],"also":[13],"been":[14],"explored":[16],"for":[17],"image":[18,23,32,81,85,96,103,112,149,166,181,197],"captioning.":[19],"However,":[20],"these":[21],"diffusion-based":[22,57,140,218],"captioning":[24,167,182],"methods":[25,236],"usually":[26],"suffer":[27],"from":[28,163],"semantic":[29,63],"inconsistency":[30],"between":[31,194],"content":[33],"and":[34,98,105,114,184,198,225],"textual":[35,106,117],"description,":[36],"thus":[37],"producing":[38],"lagging":[39],"results":[40],"compared":[41],"with":[42,65,135],"Auto-Regressive":[43],"(AR)":[44],"ones.":[45],"To":[46,129],"this":[47,50],"end,":[48],"paper,":[51],"we":[52],"propose":[53],"a":[54,66,74,83,120,139,157,171],"novel":[55],"dual":[56],"framework":[58,173],"namely":[59],"MirrorDiff,":[60],"to":[61,101,124,145,221],"achieve":[62],"consistency":[64],"symmetric":[67],"image-to-text-to-image":[68],"model,":[70],"acts":[72],"like":[73],"mirror":[75],"that":[76,210],"maps":[77],"the":[78,87,111,115,132,136,147,152,187,191,195,199,205,234],"original":[79],"input":[80,137,148,196],"into":[82,119,178],"regenerated":[84,200],"via":[86,190],"generated":[88,188],"caption.":[89],"Specifically,":[90],"it":[91],"first":[92],"utilizes":[93],"both":[94],"pre-trained":[95],"encoder":[97,100],"text":[99],"obtain":[102],"representation":[104,107,113,118],"respectively,":[108],"then":[109],"forwards":[110],"noisy":[116],"continuous":[121],"model":[123],"output":[125],"an":[126],"intermediate":[127,133,153],"sentence.":[128],"semantically":[130],"align":[131],"sentence":[134,189],"image,":[138],"visual":[141,159,192],"regenerator":[142],"is":[143,170],"employed":[144],"regenerate":[146],"conditioned":[150],"on":[151,204,223,229,238],"sentence,":[154],"resulting":[155],"proposed":[158],"regeneration":[160],"loss.":[161],"Different":[162],"most":[164],"existing":[165],"methods,":[168,183,219],"MirrorDiff":[169],"plug-and-play":[172],"can":[175],"be":[176],"plugged":[177],"many":[179],"previous":[180],"further":[185],"evaluate":[186],"similarity":[193],"image.":[201],"Extensive":[202],"experiments":[203],"MS":[206],"COCO":[207],"dataset":[208],"show":[209],"our":[211],"method":[212],"achieves":[213,226],"obvious":[214],"improvements":[215],"over":[216,233],"state-of-the-art":[217],"up":[220],"127.9":[222],"CIDEr,":[224],"competitive":[227],"performance":[228],"multiple":[230],"evaluation":[231],"metrics":[232],"auto-regressive":[235],"trained":[237],"larger-scale":[239],"datasets.":[240]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
