{"id":"https://openalex.org/W4401075326","doi":"https://doi.org/10.1145/3682067","title":"Efficiently Gluing Pre-Trained Language and Vision Models for Image Captioning","display_name":"Efficiently Gluing Pre-Trained Language and Vision Models for Image Captioning","publication_year":2024,"publication_date":"2024-07-29","ids":{"openalex":"https://openalex.org/W4401075326","doi":"https://doi.org/10.1145/3682067"},"language":"en","primary_location":{"id":"doi:10.1145/3682067","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3682067","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3682067","source":{"id":"https://openalex.org/S2492086750","display_name":"ACM Transactions on Intelligent Systems and Technology","issn_l":"2157-6904","issn":["2157-6904","2157-6912"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Intelligent Systems and Technology","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"bronze","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3682067","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5015177224","display_name":"Peipei Song","orcid":"https://orcid.org/0000-0001-6764-3375"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Peipei Song","raw_affiliation_strings":["University of Science and Technology of China, Hefei, China"],"raw_orcid":"https://orcid.org/0000-0001-6764-3375","affiliations":[{"raw_affiliation_string":"University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025436570","display_name":"Yuanen Zhou","orcid":"https://orcid.org/0000-0002-4986-3611"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuanen Zhou","raw_affiliation_strings":["Institute of Artificial Intelligence, Hefei Comprehensive National Science Center, Hefei, China"],"raw_orcid":"https://orcid.org/0000-0002-4986-3611","affiliations":[{"raw_affiliation_string":"Institute of Artificial Intelligence, Hefei Comprehensive National Science Center, Hefei, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034737032","display_name":"Xun Yang","orcid":"https://orcid.org/0000-0003-0201-1638"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xun Yang","raw_affiliation_strings":["University of Science and Technology of China, Hefei, China"],"raw_orcid":"https://orcid.org/0000-0003-0201-1638","affiliations":[{"raw_affiliation_string":"University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101424059","display_name":"Daqing Liu","orcid":"https://orcid.org/0000-0002-8286-0105"},"institutions":[{"id":"https://openalex.org/I4210103986","display_name":"Jingdong (China)","ror":"https://ror.org/01dkjkq64","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210103986"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Daqing Liu","raw_affiliation_strings":["JD Explore Academy, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-8286-0105","affiliations":[{"raw_affiliation_string":"JD Explore Academy, Beijing, China","institution_ids":["https://openalex.org/I4210103986"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040235604","display_name":"Zhenzhen Hu","orcid":"https://orcid.org/0000-0003-1042-8361"},"institutions":[{"id":"https://openalex.org/I16365422","display_name":"Hefei University of Technology","ror":"https://ror.org/02czkny70","country_code":"CN","type":"education","lineage":["https://openalex.org/I16365422"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhenzhen Hu","raw_affiliation_strings":["Hefei University of Technology, Hefei, China"],"raw_orcid":"https://orcid.org/0000-0003-1042-8361","affiliations":[{"raw_affiliation_string":"Hefei University of Technology, Hefei, China","institution_ids":["https://openalex.org/I16365422"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101952061","display_name":"Depeng Wang","orcid":"https://orcid.org/0000-0001-6786-0732"},"institutions":[{"id":"https://openalex.org/I16365422","display_name":"Hefei University of Technology","ror":"https://ror.org/02czkny70","country_code":"CN","type":"education","lineage":["https://openalex.org/I16365422"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Depeng Wang","raw_affiliation_strings":["Hefei University of Technology, Hefei, China"],"raw_orcid":"https://orcid.org/0000-0001-6786-0732","affiliations":[{"raw_affiliation_string":"Hefei University of Technology, Hefei, China","institution_ids":["https://openalex.org/I16365422"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100377147","display_name":"Meng Wang","orcid":"https://orcid.org/0000-0002-3094-7735"},"institutions":[{"id":"https://openalex.org/I16365422","display_name":"Hefei University of Technology","ror":"https://ror.org/02czkny70","country_code":"CN","type":"education","lineage":["https://openalex.org/I16365422"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Meng Wang","raw_affiliation_strings":["Hefei University of Technology and Institute of Artificial Intelligence, Hefei Comprehensive National Science Center, Hefei, China"],"raw_orcid":"https://orcid.org/0000-0002-3094-7735","affiliations":[{"raw_affiliation_string":"Hefei University of Technology and Institute of Artificial Intelligence, Hefei Comprehensive National Science Center, Hefei, China","institution_ids":["https://openalex.org/I16365422"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5015177224"],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":2.1427,"has_fulltext":false,"cited_by_count":9,"citation_normalized_percentile":{"value":0.88583984,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":"15","issue":"6","first_page":"1","last_page":"16"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9937999844551086,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9731000065803528,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.9352849721908569},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.9090571403503418},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6210842728614807},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.5016236305236816},{"id":"https://openalex.org/keywords/semantic-gap","display_name":"Semantic gap","score":0.5009081363677979},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.4635647237300873},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4489459693431854},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4467344284057617},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.4406716525554657},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.37095993757247925},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.34069231152534485},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.32294726371765137},{"id":"https://openalex.org/keywords/image-retrieval","display_name":"Image retrieval","score":0.11319327354431152}],"concepts":[{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.9352849721908569},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.9090571403503418},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6210842728614807},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.5016236305236816},{"id":"https://openalex.org/C86034646","wikidata":"https://www.wikidata.org/wiki/Q474311","display_name":"Semantic gap","level":4,"score":0.5009081363677979},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.4635647237300873},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4489459693431854},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4467344284057617},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.4406716525554657},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.37095993757247925},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.34069231152534485},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.32294726371765137},{"id":"https://openalex.org/C1667742","wikidata":"https://www.wikidata.org/wiki/Q10927554","display_name":"Image retrieval","level":3,"score":0.11319327354431152},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3682067","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3682067","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3682067","source":{"id":"https://openalex.org/S2492086750","display_name":"ACM Transactions on Intelligent Systems and Technology","issn_l":"2157-6904","issn":["2157-6904","2157-6912"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Intelligent Systems and Technology","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1145/3682067","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3682067","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3682067","source":{"id":"https://openalex.org/S2492086750","display_name":"ACM Transactions on Intelligent Systems and Technology","issn_l":"2157-6904","issn":["2157-6904","2157-6912"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Intelligent Systems and Technology","raw_type":"journal-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.75,"display_name":"Quality Education"}],"awards":[{"id":"https://openalex.org/G186231419","display_name":null,"funder_award_id":"U22A2094, 62272435, 62020106007, 62172138, and 61932009","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G8102618002","display_name":null,"funder_award_id":"JZ2024HGTG0310","funder_id":"https://openalex.org/F4320335787","funder_display_name":"Fundamental Research Funds for the Central Universities"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335787","display_name":"Fundamental Research Funds for the Central Universities","ror":null}],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4401075326.pdf"},"referenced_works_count":51,"referenced_works":["https://openalex.org/W1773149199","https://openalex.org/W1905882502","https://openalex.org/W1956340063","https://openalex.org/W2064675550","https://openalex.org/W2163605009","https://openalex.org/W2186222003","https://openalex.org/W2302086703","https://openalex.org/W2463955103","https://openalex.org/W2575842049","https://openalex.org/W2745461083","https://openalex.org/W2766623491","https://openalex.org/W2795151422","https://openalex.org/W2887585070","https://openalex.org/W2890531016","https://openalex.org/W2963084599","https://openalex.org/W2963101956","https://openalex.org/W2966715458","https://openalex.org/W2968101724","https://openalex.org/W2986670728","https://openalex.org/W3034655362","https://openalex.org/W3035284526","https://openalex.org/W3035309251","https://openalex.org/W3035323998","https://openalex.org/W3035497460","https://openalex.org/W3091588028","https://openalex.org/W3105136412","https://openalex.org/W3152619510","https://openalex.org/W3167939936","https://openalex.org/W3173220247","https://openalex.org/W3177174258","https://openalex.org/W3204447181","https://openalex.org/W4205991051","https://openalex.org/W4234552385","https://openalex.org/W4282920689","https://openalex.org/W4285200050","https://openalex.org/W4287113019","https://openalex.org/W4304014869","https://openalex.org/W4323864040","https://openalex.org/W4385245566","https://openalex.org/W4386076004","https://openalex.org/W4386391014","https://openalex.org/W4386607611","https://openalex.org/W4387969035","https://openalex.org/W4388796930","https://openalex.org/W4389523981","https://openalex.org/W4390195594","https://openalex.org/W4391454521","https://openalex.org/W4399055279","https://openalex.org/W4402776467","https://openalex.org/W6600234944","https://openalex.org/W6739901393"],"related_works":["https://openalex.org/W4210416330","https://openalex.org/W2775506363","https://openalex.org/W3088136942","https://openalex.org/W4290852288","https://openalex.org/W2949362007","https://openalex.org/W4388893791","https://openalex.org/W4283207562","https://openalex.org/W2963177403","https://openalex.org/W2330246314","https://openalex.org/W2949522393"],"abstract_inverted_index":{"Vision-and-language":[0],"pre-training":[1],"models":[2,43,48,259],"have":[3,69],"achieved":[4],"impressive":[5],"performance":[6,214,228,248],"for":[7,54,194],"image":[8,55,127,139,164,177],"captioning.":[9,56],"But":[10],"most":[11],"of":[12,18,39,116,131,134,229,255],"them":[13,53,143],"are":[14,58,260],"trained":[15],"with":[16,149],"millions":[17],"paired":[19],"image-text":[20],"data":[21],"and":[22,26,45,50,66,80,94,104,110,141,171,188,205,258],"require":[23],"huge":[24],"memory":[25],"computing":[27],"overhead.":[28],"To":[29,97],"alleviate":[30],"this,":[31],"we":[32,100,124,158,224],"try":[33],"to":[34,107,147,152,160,183,197,215],"stand":[35],"on":[36,176,202,231],"the":[37,81,85,91,121,155,162,180,185,195,203,221,232,243],"shoulders":[38],"large-scale":[40],"pre-trained":[41,46,92],"language":[42,65,93],"(PLM)":[44],"vision":[47,67,95],"(PVM)":[49],"efficiently":[51],"connect":[52],"There":[57],"two":[59],"major":[60],"challenges:":[61],"one":[62],"is":[63,83],"that":[64,84,209,241],"modalities":[68],"different":[70],"semantic":[71,86,156,186,189],"granularity":[72,187],"(e.g.,":[73],"a":[74,102,114,129,226,252],"noun":[75],"may":[76],"cover":[77],"many":[78],"pixels),":[79],"other":[82],"gap":[87,190],"still":[88],"exists":[89],"between":[90],"models.":[96],"this":[98],"end,":[99],"design":[101],"lightweight":[103],"efficient":[105],"connector":[106,181],"glue":[108],"PVM":[109],"PLM,":[111],"which":[112],"holds":[113],"criterion":[115],"selection-then-transformation":[117],".":[118,264],"Specifically,":[119],"in":[120,251],"selection":[122],"phase,":[123],"treat":[125],"each":[126],"as":[128],"set":[130],"patches":[132,140,165],"instead":[133],"pixels.":[135],"We":[136],"select":[137],"salient":[138],"cluster":[142],"into":[144,166],"visual":[145],"regions":[146],"align":[148],"text.":[150],"Then,":[151],"effectively":[153],"reduce":[154],"gap,":[157],"propose":[159],"map":[161],"selected":[163],"text":[167],"space":[168],"through":[169],"spatial":[170],"channel":[172],"transformations.":[173],"With":[174],"training":[175,220],"captioning":[178],"datasets,":[179],"learns":[182],"bridge":[184],"via":[191],"backpropagation,":[192],"preparing":[193],"PLM":[196,244],"generate":[198],"descriptions.":[199],"Experimental":[200],"results":[201],"MSCOCO":[204,233],"Flickr30k":[206],"datasets":[207],"demonstrate":[208],"our":[210,238],"method":[211],"yields":[212],"comparable":[213],"existing":[216],"works.":[217],"By":[218],"solely":[219],"small":[222],"connector,":[223],"achieve":[225],"CIDEr":[227,253],"132.2%":[230],"Karpathy":[234],"test":[235],"split.":[236],"Moreover,":[237],"findings":[239],"reveal":[240],"fine-tuning":[242],"can":[245],"further":[246],"enhance":[247],"potential,":[249],"resulting":[250],"score":[254],"140.6%.":[256],"Code":[257],"available":[261],"at":[262],"https://github.com/YuanEZhou/PrefixCap":[263]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":8}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
