{"id":"https://openalex.org/W4405754391","doi":"https://doi.org/10.1109/tmm.2024.3521785","title":"Unleash the Power of Vision-Language Models by Visual Attention Prompt and Multimodal Interaction","display_name":"Unleash the Power of Vision-Language Models by Visual Attention Prompt and Multimodal Interaction","publication_year":2024,"publication_date":"2024-12-24","ids":{"openalex":"https://openalex.org/W4405754391","doi":"https://doi.org/10.1109/tmm.2024.3521785"},"language":"en","primary_location":{"id":"doi:10.1109/tmm.2024.3521785","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2024.3521785","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Wenyao Zhang","orcid":"https://orcid.org/0009-0006-3090-255X"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Wenyao Zhang","raw_affiliation_strings":["MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Letian Wu","orcid":"https://orcid.org/0009-0004-0438-7270"},"institutions":[{"id":"https://openalex.org/I76569877","display_name":"Southeast University","ror":"https://ror.org/04ct4d772","country_code":"CN","type":"education","lineage":["https://openalex.org/I76569877"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Letian Wu","raw_affiliation_strings":["School of Automation, Southeast University, Nanjing, China","School of Automation, Southeast University, Nanjing, Jiangsu, China"],"affiliations":[{"raw_affiliation_string":"School of Automation, Southeast University, Nanjing, China","institution_ids":["https://openalex.org/I76569877"]},{"raw_affiliation_string":"School of Automation, Southeast University, Nanjing, Jiangsu, China","institution_ids":["https://openalex.org/I76569877"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040291509","display_name":"Zequn Zhang","orcid":"https://orcid.org/0000-0001-5566-761X"},"institutions":[{"id":"https://openalex.org/I4394709157","display_name":"Eastern Institute of Technology, Ningbo","ror":"https://ror.org/036mbz113","country_code":null,"type":"education","lineage":["https://openalex.org/I4394709157"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zequn Zhang","raw_affiliation_strings":["Ningbo Institute of Digital Twin, Eastern Institute of Technology, Ningbo, China"],"affiliations":[{"raw_affiliation_string":"Ningbo Institute of Digital Twin, Eastern Institute of Technology, Ningbo, China","institution_ids":["https://openalex.org/I4394709157"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100664162","display_name":"Tao Yu","orcid":"https://orcid.org/0000-0003-2550-5008"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tao Yu","raw_affiliation_strings":["Department of Electronic Engineering and Information Science, University of Science and Technology of China, Hefei, China","Department of Electronic Engineering and Information Science, University of Science and Technology of China, Hefei, Anhui, China"],"affiliations":[{"raw_affiliation_string":"Department of Electronic Engineering and Information Science, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"Department of Electronic Engineering and Information Science, University of Science and Technology of China, Hefei, Anhui, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100704678","display_name":"Chao Ma","orcid":"https://orcid.org/0000-0002-8459-2845"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chao Ma","raw_affiliation_strings":["MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101828824","display_name":"Xin Jin","orcid":"https://orcid.org/0000-0002-1820-8358"},"institutions":[{"id":"https://openalex.org/I4394709157","display_name":"Eastern Institute of Technology, Ningbo","ror":"https://ror.org/036mbz113","country_code":null,"type":"education","lineage":["https://openalex.org/I4394709157"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xin Jin","raw_affiliation_strings":["Ningbo Institute of Digital Twin, Eastern Institute of Technology, Ningbo, China"],"affiliations":[{"raw_affiliation_string":"Ningbo Institute of Digital Twin, Eastern Institute of Technology, Ningbo, China","institution_ids":["https://openalex.org/I4394709157"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019708391","display_name":"Xiaokang Yang","orcid":"https://orcid.org/0000-0003-4029-3322"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaokang Yang","raw_affiliation_strings":["MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101739843","display_name":"Wenjun Zeng","orcid":"https://orcid.org/0000-0003-2531-3137"},"institutions":[{"id":"https://openalex.org/I4394709157","display_name":"Eastern Institute of Technology, Ningbo","ror":"https://ror.org/036mbz113","country_code":null,"type":"education","lineage":["https://openalex.org/I4394709157"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenjun Zeng","raw_affiliation_strings":["Ningbo Institute of Digital Twin, Eastern Institute of Technology, Ningbo, China"],"affiliations":[{"raw_affiliation_string":"Ningbo Institute of Digital Twin, Eastern Institute of Technology, Ningbo, China","institution_ids":["https://openalex.org/I4394709157"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":0.245,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.56635861,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":"27","issue":null,"first_page":"2399","last_page":"2411"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.916100025177002,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.916100025177002,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8554021120071411},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.6252350211143494},{"id":"https://openalex.org/keywords/visual-language","display_name":"Visual language","score":0.47223684191703796},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.4353524148464203},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.42087244987487793},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.384024977684021},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.09829142689704895}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8554021120071411},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.6252350211143494},{"id":"https://openalex.org/C2780878386","wikidata":"https://www.wikidata.org/wiki/Q1659648","display_name":"Visual language","level":2,"score":0.47223684191703796},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.4353524148464203},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.42087244987487793},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.384024977684021},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.09829142689704895},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tmm.2024.3521785","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2024.3521785","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/10","score":0.46000000834465027,"display_name":"Reduced inequalities"}],"awards":[{"id":"https://openalex.org/G3527258163","display_name":null,"funder_award_id":"62302246","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":69,"referenced_works":["https://openalex.org/W12634471","https://openalex.org/W1861492603","https://openalex.org/W1977295328","https://openalex.org/W2017814585","https://openalex.org/W2047643928","https://openalex.org/W2108598243","https://openalex.org/W2138011018","https://openalex.org/W2155904486","https://openalex.org/W2277195237","https://openalex.org/W2533598788","https://openalex.org/W2832876791","https://openalex.org/W2896457183","https://openalex.org/W2964194231","https://openalex.org/W2970476646","https://openalex.org/W2998356391","https://openalex.org/W3034552520","https://openalex.org/W3034837210","https://openalex.org/W3035422918","https://openalex.org/W3090449556","https://openalex.org/W3185341429","https://openalex.org/W3198377975","https://openalex.org/W4205991051","https://openalex.org/W4226058394","https://openalex.org/W4229453513","https://openalex.org/W4289752563","https://openalex.org/W4295917908","https://openalex.org/W4312310776","https://openalex.org/W4312651322","https://openalex.org/W4312655527","https://openalex.org/W4312804044","https://openalex.org/W4312884055","https://openalex.org/W4313156423","https://openalex.org/W4313175608","https://openalex.org/W4364302332","https://openalex.org/W4368232796","https://openalex.org/W4382999123","https://openalex.org/W4386071547","https://openalex.org/W4386075788","https://openalex.org/W4386076084","https://openalex.org/W4386076681","https://openalex.org/W4386790226","https://openalex.org/W4387272106","https://openalex.org/W4390344271","https://openalex.org/W4390872773","https://openalex.org/W4390873714","https://openalex.org/W4391451889","https://openalex.org/W4393156100","https://openalex.org/W4402754025","https://openalex.org/W4402783842","https://openalex.org/W4403488721","https://openalex.org/W6600983433","https://openalex.org/W6638677478","https://openalex.org/W6753038380","https://openalex.org/W6759579507","https://openalex.org/W6766673545","https://openalex.org/W6766904570","https://openalex.org/W6778883912","https://openalex.org/W6790019176","https://openalex.org/W6791353385","https://openalex.org/W6796538260","https://openalex.org/W6796581206","https://openalex.org/W6796761347","https://openalex.org/W6798805250","https://openalex.org/W6803728867","https://openalex.org/W6811013733","https://openalex.org/W6844432065","https://openalex.org/W6846004651","https://openalex.org/W6849976536","https://openalex.org/W6853299294"],"related_works":["https://openalex.org/W2772917594","https://openalex.org/W2036807459","https://openalex.org/W2058170566","https://openalex.org/W2755342338","https://openalex.org/W2166024367","https://openalex.org/W3116076068","https://openalex.org/W2229312674","https://openalex.org/W2951359407","https://openalex.org/W2079911747","https://openalex.org/W1969923398"],"abstract_inverted_index":{"Pre-trained":[0],"vision-language":[1,85],"models":[2],"(VLMs),":[3],"equipped":[4],"with":[5,139],"parameter-efficient":[6],"tuning":[7,48],"(PET)":[8],"methods":[9,49,62],"like":[10],"prompting,":[11],"have":[12],"shown":[13],"impressive":[14],"knowledge":[15],"transferability":[16],"on":[17,66,116,162],"new":[18,81],"downstream":[19,94],"tasks,":[20],"but":[21],"they":[22],"are":[23],"still":[24],"prone":[25],"to":[26,36,93,114],"be":[27],"limited":[28],"by":[29,96],"catastrophic":[30],"forgetting":[31],"and":[32,153,167],"overfitting":[33],"dilemma":[34],"due":[35],"large":[37],"gaps":[38],"among":[39],"tasks.":[40],"Furthermore,":[41],"the":[42,75,106,111,120,163],"underlying":[43],"physical":[44],"mechanisms":[45],"of":[46,119],"prompt-based":[47,82],"(especially":[50],"for":[51,71,84,156],"visual":[52,98,152],"prompting)":[53],"remain":[54],"largely":[55],"unexplored.":[56],"It":[57],"is":[58],"unclear":[59],"why":[60],"these":[61],"work":[63],"solely":[64],"based":[65],"learnable":[67],"parameters":[68,179],"as":[69],"prompts":[70,99,133],"adaptation.":[72],"To":[73],"address":[74],"above":[76],"challenges,":[77],"we":[78],"present":[79],"a":[80,136,147],"framework":[83,90],"models,":[86],"termed":[87],"Uni-prompt.":[88],"Our":[89],"transfers":[91],"VLMs":[92],"tasks":[95],"designing":[97],"from":[100],"an":[101],"attention":[102],"perspective":[103],"that":[104],"reduces":[105],"transfer/solution":[107],"space,":[108],"which":[109,144],"enables":[110],"vision":[112],"model":[113],"focus":[115],"task-relevant":[117],"regions":[118],"input":[121],"image":[122],"while":[123,175],"also":[124],"learning":[125,134],"task-specific":[126],"knowledge.":[127],"Additionally,":[128],"Uni-prompt":[129,173],"further":[130],"aligns":[131],"visual-text":[132],"through":[135],"pretext":[137],"task":[138,166],"masked":[140],"representation":[141],"modeling":[142],"interactions,":[143],"implicitly":[145],"learns":[146],"global":[148],"cross-modal":[149],"matching":[150],"between":[151],"language":[154],"concepts":[155],"consistency.":[157],"We":[158],"conduct":[159],"extensive":[160],"experiments":[161],"few-shot":[164],"classification":[165],"achieve":[168],"significant":[169],"improvement":[170],"using":[171],"our":[172],"method":[174],"requiring":[176],"minimal":[177],"extra":[178],"cost.":[180]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-10T00:00:00"}
