{"id":"https://openalex.org/W4415709227","doi":"https://doi.org/10.1109/icme59968.2025.11209160","title":"Perspective Makes Perfect: Prompt-tuning Vision-Language Models for Action Recognition with Diversified Multi-Modal Observation","display_name":"Perspective Makes Perfect: Prompt-tuning Vision-Language Models for Action Recognition with Diversified Multi-Modal Observation","publication_year":2025,"publication_date":"2025-06-30","ids":{"openalex":"https://openalex.org/W4415709227","doi":"https://doi.org/10.1109/icme59968.2025.11209160"},"language":null,"primary_location":{"id":"doi:10.1109/icme59968.2025.11209160","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11209160","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5013248684","display_name":"Hailun Zhang","orcid":"https://orcid.org/0000-0001-9818-3332"},"institutions":[{"id":"https://openalex.org/I4210125143","display_name":"Chengdu University","ror":"https://ror.org/034z67559","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210125143"]},{"id":"https://openalex.org/I24185976","display_name":"Sichuan University","ror":"https://ror.org/011ashp19","country_code":"CN","type":"education","lineage":["https://openalex.org/I24185976"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Hailun Zhang","raw_affiliation_strings":["Sichuan University,College of Computer Science,Chengdu,China"],"affiliations":[{"raw_affiliation_string":"Sichuan University,College of Computer Science,Chengdu,China","institution_ids":["https://openalex.org/I4210125143","https://openalex.org/I24185976"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085914001","display_name":"Qijun Zhao","orcid":"https://orcid.org/0000-0003-4651-7163"},"institutions":[{"id":"https://openalex.org/I4210125143","display_name":"Chengdu University","ror":"https://ror.org/034z67559","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210125143"]},{"id":"https://openalex.org/I24185976","display_name":"Sichuan University","ror":"https://ror.org/011ashp19","country_code":"CN","type":"education","lineage":["https://openalex.org/I24185976"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qijun Zhao","raw_affiliation_strings":["Sichuan University,College of Computer Science,Chengdu,China"],"affiliations":[{"raw_affiliation_string":"Sichuan University,College of Computer Science,Chengdu,China","institution_ids":["https://openalex.org/I4210125143","https://openalex.org/I24185976"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038895104","display_name":"Zhen Zhai","orcid":null},"institutions":[{"id":"https://openalex.org/I24185976","display_name":"Sichuan University","ror":"https://ror.org/011ashp19","country_code":"CN","type":"education","lineage":["https://openalex.org/I24185976"]},{"id":"https://openalex.org/I4210125143","display_name":"Chengdu University","ror":"https://ror.org/034z67559","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210125143"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhen Zhai","raw_affiliation_strings":["Sichuan University,College of Computer Science,Chengdu,China"],"affiliations":[{"raw_affiliation_string":"Sichuan University,College of Computer Science,Chengdu,China","institution_ids":["https://openalex.org/I4210125143","https://openalex.org/I24185976"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100768845","display_name":"Xinrui Wang","orcid":"https://orcid.org/0000-0001-7003-4917"},"institutions":[{"id":"https://openalex.org/I24185976","display_name":"Sichuan University","ror":"https://ror.org/011ashp19","country_code":"CN","type":"education","lineage":["https://openalex.org/I24185976"]},{"id":"https://openalex.org/I4210125143","display_name":"Chengdu University","ror":"https://ror.org/034z67559","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210125143"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xinrui Wang","raw_affiliation_strings":["Sichuan University,College of Computer Science,Chengdu,China"],"affiliations":[{"raw_affiliation_string":"Sichuan University,College of Computer Science,Chengdu,China","institution_ids":["https://openalex.org/I4210125143","https://openalex.org/I24185976"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5013248684"],"corresponding_institution_ids":["https://openalex.org/I24185976","https://openalex.org/I4210125143"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.31247905,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7056999802589417,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7056999802589417,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.23849999904632568,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.010900000110268593,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.6366000175476074},{"id":"https://openalex.org/keywords/perspective","display_name":"Perspective (graphical)","score":0.6068999767303467},{"id":"https://openalex.org/keywords/redundancy","display_name":"Redundancy (engineering)","score":0.508400022983551},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.4982999861240387},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.4569999873638153},{"id":"https://openalex.org/keywords/action-recognition","display_name":"Action recognition","score":0.44440001249313354},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.43790000677108765}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6929000020027161},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.6366000175476074},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.6068999767303467},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6007999777793884},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.508400022983551},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.4982999861240387},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.4569999873638153},{"id":"https://openalex.org/C2987834672","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Action recognition","level":3,"score":0.44440001249313354},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.43790000677108765},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.4350999891757965},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.414000004529953},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.37770000100135803},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.33489999175071716},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.30550000071525574},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2874000072479248},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.2685999870300293},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.2563000023365021},{"id":"https://openalex.org/C2781316041","wikidata":"https://www.wikidata.org/wiki/Q1230584","display_name":"Diversity (politics)","level":2,"score":0.2542000114917755},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.25060001015663147}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icme59968.2025.11209160","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11209160","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":20,"referenced_works":["https://openalex.org/W2126579184","https://openalex.org/W3025800305","https://openalex.org/W3126721948","https://openalex.org/W3145385912","https://openalex.org/W3185341429","https://openalex.org/W3214993537","https://openalex.org/W4214612132","https://openalex.org/W4214747681","https://openalex.org/W4221142658","https://openalex.org/W4312480274","https://openalex.org/W4312558481","https://openalex.org/W4312614039","https://openalex.org/W4386065852","https://openalex.org/W4386072441","https://openalex.org/W4386075799","https://openalex.org/W4386159789","https://openalex.org/W4388854793","https://openalex.org/W4390874575","https://openalex.org/W4402657844","https://openalex.org/W4403237206"],"related_works":[],"abstract_inverted_index":{"Observing":[0],"key":[1],"visual":[2,92,101],"cues":[3],"and":[4,40,72,115,122,135],"reasoning":[5],"multi-modal":[6,120],"semantics":[7,74,105],"from":[8,34,53,142],"diversified":[9,54,100,111],"perspectives":[10,112,144],"are":[11,113],"critical":[12],"in":[13,76,145],"human":[14,49],"beings\u2019":[15],"perception":[16],"of":[17,37,87,131,139],"actions.":[18],"However,":[19],"existing":[20],"action":[21,67],"recognition":[22,68],"methods,":[23],"including":[24],"ones":[25],"based":[26],"on":[27],"prompt-tuning":[28,64],"image-based":[29],"vision-language":[30],"(I-VL)":[31],"models,":[32],"suffer":[33],"the":[35,129,137],"under-exploration":[36],"video":[38,66],"contents":[39],"textual":[41,104],"semantics.":[42,93],"In":[43],"this":[44],"paper,":[45],"inspired":[46],"by":[47,69],"how":[48],"beings":[50],"reason":[51],"actions":[52],"perspectives,":[55],"we":[56,80,118],"propose":[57],"Diversified":[58],"Multi-Modal":[59],"Observation":[60],"(DMMO)":[61],"to":[62],"improve":[63],"for":[65,90,103],"reducing":[70],"redundancy":[71],"enhancing":[73],"diversity":[75],"various":[77],"perspectives.":[78],"Firstly,":[79],"select":[81],"a":[82],"few":[83],"informative":[84],"frames":[85],"instead":[86],"dense":[88],"sampling":[89],"clear":[91],"Then,":[94],"subject-context":[95],"segmentation":[96],"is":[97],"applied":[98],"as":[99],"emphases":[102],"analysis.":[106],"Afterward,":[107],"auxiliary":[108],"captions":[109],"with":[110],"generated":[114],"aggregated.":[116],"Finally,":[117],"perform":[119],"interaction":[121],"align":[123],"outputs.":[124],"Extensive":[125],"evaluation":[126],"experiments":[127],"show":[128],"superiority":[130],"our":[132],"proposed":[133],"method,":[134],"demonstrate":[136],"effectiveness":[138],"diversifying":[140],"prompts":[141],"multiple":[143],"tuning":[146],"I-VL":[147],"models.":[148]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-30T00:00:00"}
