{"id":"https://openalex.org/W4367046808","doi":"https://doi.org/10.1145/3543507.3583232","title":"CapEnrich: Enriching Caption Semantics for Web Images via Cross-modal Pre-trained Knowledge","display_name":"CapEnrich: Enriching Caption Semantics for Web Images via Cross-modal Pre-trained Knowledge","publication_year":2023,"publication_date":"2023-04-26","ids":{"openalex":"https://openalex.org/W4367046808","doi":"https://doi.org/10.1145/3543507.3583232"},"language":"en","primary_location":{"id":"doi:10.1145/3543507.3583232","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3543507.3583232","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM Web Conference 2023","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100599428","display_name":"Linli Yao","orcid":"https://orcid.org/0000-0002-9809-8864"},"institutions":[{"id":"https://openalex.org/I78988378","display_name":"Renmin University of China","ror":"https://ror.org/041pakw92","country_code":"CN","type":"education","lineage":["https://openalex.org/I78988378"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Linli Yao","raw_affiliation_strings":["School of Information, Renmin University of China, China"],"affiliations":[{"raw_affiliation_string":"School of Information, Renmin University of China, China","institution_ids":["https://openalex.org/I78988378"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012716072","display_name":"Weijing Chen","orcid":"https://orcid.org/0000-0001-9371-5256"},"institutions":[{"id":"https://openalex.org/I78988378","display_name":"Renmin University of China","ror":"https://ror.org/041pakw92","country_code":"CN","type":"education","lineage":["https://openalex.org/I78988378"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weijing Chen","raw_affiliation_strings":["School of Information, Renmin University of China, China"],"affiliations":[{"raw_affiliation_string":"School of Information, Renmin University of China, China","institution_ids":["https://openalex.org/I78988378"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5009985839","display_name":"Qin Jin","orcid":"https://orcid.org/0000-0001-6486-6020"},"institutions":[{"id":"https://openalex.org/I78988378","display_name":"Renmin University of China","ror":"https://ror.org/041pakw92","country_code":"CN","type":"education","lineage":["https://openalex.org/I78988378"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qin Jin","raw_affiliation_strings":["School of Information, Renmin University of China, China"],"affiliations":[{"raw_affiliation_string":"School of Information, Renmin University of China, China","institution_ids":["https://openalex.org/I78988378"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5100599428"],"corresponding_institution_ids":["https://openalex.org/I78988378"],"apc_list":null,"apc_paid":null,"fwci":0.4913,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.63509483,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"2392","last_page":"2401"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9879999756813049,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9710999727249146,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8011034727096558},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.6700810194015503},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.6309525370597839},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4630373418331146},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.39999955892562866},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3564136326313019},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.3266619145870209},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.12788206338882446}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8011034727096558},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.6700810194015503},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.6309525370597839},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4630373418331146},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.39999955892562866},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3564136326313019},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.3266619145870209},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.12788206338882446},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3543507.3583232","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3543507.3583232","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM Web Conference 2023","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.75}],"awards":[{"id":"https://openalex.org/G8390650537","display_name":null,"funder_award_id":"62072462","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W1593271688","https://openalex.org/W1895577753","https://openalex.org/W1905882502","https://openalex.org/W1956340063","https://openalex.org/W2185175083","https://openalex.org/W2506483933","https://openalex.org/W2525579820","https://openalex.org/W2604178507","https://openalex.org/W2754927243","https://openalex.org/W2798959609","https://openalex.org/W2896348597","https://openalex.org/W2963084599","https://openalex.org/W2963170456","https://openalex.org/W2963448089","https://openalex.org/W2981852735","https://openalex.org/W2987118624","https://openalex.org/W2989489923","https://openalex.org/W3035485997","https://openalex.org/W3090449556","https://openalex.org/W3091588028","https://openalex.org/W3107015116","https://openalex.org/W3173937618","https://openalex.org/W3174770825","https://openalex.org/W3176641147","https://openalex.org/W3184784418","https://openalex.org/W3205647238","https://openalex.org/W4205991051","https://openalex.org/W4288089799","https://openalex.org/W6778883912"],"related_works":["https://openalex.org/W2748952813","https://openalex.org/W2379392295","https://openalex.org/W3160965418","https://openalex.org/W613940353","https://openalex.org/W2320915480","https://openalex.org/W2362990116","https://openalex.org/W2381300099","https://openalex.org/W2714992399","https://openalex.org/W2383812217","https://openalex.org/W2326515389"],"abstract_inverted_index":{"Automatically":[0],"generating":[1,30],"textual":[2,54,89,151],"descriptions":[3,49,113],"for":[4,44,56,97,222],"massive":[5],"unlabeled":[6],"images":[7],"on":[8,132],"the":[9,27,62,88,110,158,165,174,215],"web":[10,15,58,223],"can":[11,180],"greatly":[12],"benefit":[13],"realistic":[14],"applications,":[16],"e.g.":[17],"multimodal":[18],"retrieval":[19],"and":[20,141,162,217],"recommendation.":[21],"However,":[22],"existing":[23],"models":[24,69,147,179],"suffer":[25],"from":[26],"problem":[28],"of":[29,65,91,177,219],"\u201cover-generic\u201d":[31],"descriptions,":[32],"such":[33],"as":[34,183,185],"their":[35,81],"tendency":[36],"to":[37,51,85,108,126,144,148,170,187,203],"generate":[38,149],"repetitive":[39],"sentences":[40,221],"with":[41,114],"common":[42],"concepts":[43],"different":[45],"images.":[46,59,224],"These":[47],"generic":[48,111],"fail":[50],"provide":[52],"sufficient":[53],"semantics":[55,90],"ever-changing":[57],"Inspired":[60],"by":[61],"recent":[63],"success":[64],"Vision-Language":[66],"Pre-training":[67],"(VLP)":[68],"that":[70,210],"learn":[71],"diverse":[72,189],"image-text":[73],"concept":[74],"alignment":[75],"during":[76],"pretraining,":[77],"we":[78,101,119,134,156],"explore":[79],"leveraging":[80],"cross-modal":[82],"pre-trained":[83],"knowledge":[84,176],"automatically":[86],"enrich":[87],"image":[92,112],"descriptions.":[93],"With":[94],"no":[95],"need":[96],"additional":[98],"human":[99],"annotations,":[100],"propose":[102,121],"a":[103],"plug-and-play":[104],"framework,":[105],"i.e":[106],"CapEnrich,":[107],"complement":[109],"more":[115,150],"semantic":[116],"details.":[117,152],"Specifically,":[118],"first":[120],"an":[122],"automatic":[123],"data-building":[124],"strategy":[125],"get":[127],"desired":[128],"training":[129],"sentences,":[130],"based":[131],"which":[133,168],"then":[135],"adopt":[136],"prompting":[137],"strategies,":[138],"i.e.":[139],"learnable":[140,154],"template":[142],"prompts,":[143],"incentivize":[145],"VLP":[146,160,178],"For":[153],"templates,":[155],"fix":[157],"whole":[159],"model":[161],"only":[163,193],"tune":[164],"prompt":[166],"vectors,":[167],"leads":[169],"two":[171],"advantages:":[172],"1)":[173],"pre-training":[175],"be":[181],"reserved":[182],"much":[184],"possible":[186],"describe":[188],"visual":[190],"concepts;":[191],"2)":[192],"lightweight":[194],"trainable":[195],"parameters":[196],"are":[197],"required,":[198],"so":[199],"it":[200],"is":[201,227],"friendly":[202],"low":[204],"data":[205],"resources.":[206],"Extensive":[207],"experiments":[208],"show":[209],"our":[211],"method":[212],"significantly":[213],"improves":[214],"descriptiveness":[216],"diversity":[218],"generated":[220],"The":[225],"code":[226],"available":[228],"at":[229],"https://github.com/yaolinli/CapEnrich.":[230]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
