{"id":"https://openalex.org/W4415444078","doi":"https://doi.org/10.1109/tmm.2025.3618553","title":"SwimVG: Step-Wise Multimodal Fusion and Adaption for Visual Grounding","display_name":"SwimVG: Step-Wise Multimodal Fusion and Adaption for Visual Grounding","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4415444078","doi":"https://doi.org/10.1109/tmm.2025.3618553"},"language":null,"primary_location":{"id":"doi:10.1109/tmm.2025.3618553","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2025.3618553","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5046443025","display_name":"Liangtao Shi","orcid":null},"institutions":[{"id":"https://openalex.org/I16365422","display_name":"Hefei University of Technology","ror":"https://ror.org/02czkny70","country_code":"CN","type":"education","lineage":["https://openalex.org/I16365422"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Liangtao Shi","raw_affiliation_strings":["Key Laboratory of Knowledge Engineering with Big Data, Hefei University of Technology, Hefei, China"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Knowledge Engineering with Big Data, Hefei University of Technology, Hefei, China","institution_ids":["https://openalex.org/I16365422"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106405422","display_name":"Ting Liu","orcid":"https://orcid.org/0000-0002-2814-5485"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ting Liu","raw_affiliation_strings":["School of systems engineering, National University of Defense Technology, Changsha, China"],"affiliations":[{"raw_affiliation_string":"School of systems engineering, National University of Defense Technology, Changsha, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102797916","display_name":"Xiantao Hu","orcid":"https://orcid.org/0009-0007-1541-1717"},"institutions":[{"id":"https://openalex.org/I36399199","display_name":"Nanjing University of Science and Technology","ror":"https://ror.org/00xp9wg62","country_code":"CN","type":"education","lineage":["https://openalex.org/I36399199"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiantao Hu","raw_affiliation_strings":["Department of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, China","institution_ids":["https://openalex.org/I36399199"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053458494","display_name":"Yue Hu","orcid":"https://orcid.org/0000-0002-8115-7020"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yue Hu","raw_affiliation_strings":["School of systems engineering, National University of Defense Technology, Changsha, China"],"affiliations":[{"raw_affiliation_string":"School of systems engineering, National University of Defense Technology, Changsha, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100610161","display_name":"Quanjun Yin","orcid":"https://orcid.org/0000-0002-1633-174X"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Quanjun Yin","raw_affiliation_strings":["School of systems engineering, National University of Defense Technology, Changsha, China"],"affiliations":[{"raw_affiliation_string":"School of systems engineering, National University of Defense Technology, Changsha, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5029727770","display_name":"Richang Hong","orcid":null},"institutions":[{"id":"https://openalex.org/I16365422","display_name":"Hefei University of Technology","ror":"https://ror.org/02czkny70","country_code":"CN","type":"education","lineage":["https://openalex.org/I16365422"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Richang Hong","raw_affiliation_strings":["Key Laboratory of Knowledge Engineering with Big Data, Hefei University of Technology, Hefei, China"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Knowledge Engineering with Big Data, Hefei University of Technology, Hefei, China","institution_ids":["https://openalex.org/I16365422"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5046443025"],"corresponding_institution_ids":["https://openalex.org/I16365422"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.31709362,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"27","issue":null,"first_page":"9776","last_page":"9787"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.991100013256073,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.991100013256073,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.991100013256073,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10824","display_name":"Image Retrieval and Classification Techniques","score":0.9814000129699707,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/fuse","display_name":"Fuse (electrical)","score":0.7573000192642212},{"id":"https://openalex.org/keywords/ground","display_name":"Ground","score":0.6330999732017517},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.6172999739646912},{"id":"https://openalex.org/keywords/fusion","display_name":"Fusion","score":0.5196999907493591},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.42320001125335693},{"id":"https://openalex.org/keywords/multimodal-interaction","display_name":"Multimodal interaction","score":0.4074999988079071},{"id":"https://openalex.org/keywords/image-fusion","display_name":"Image fusion","score":0.36340001225471497}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8062000274658203},{"id":"https://openalex.org/C141353440","wikidata":"https://www.wikidata.org/wiki/Q182221","display_name":"Fuse (electrical)","level":2,"score":0.7573000192642212},{"id":"https://openalex.org/C168993435","wikidata":"https://www.wikidata.org/wiki/Q6501125","display_name":"Ground","level":2,"score":0.6330999732017517},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.6172999739646912},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5486999750137329},{"id":"https://openalex.org/C158525013","wikidata":"https://www.wikidata.org/wiki/Q2593739","display_name":"Fusion","level":2,"score":0.5196999907493591},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.42320001125335693},{"id":"https://openalex.org/C135641252","wikidata":"https://www.wikidata.org/wiki/Q738567","display_name":"Multimodal interaction","level":2,"score":0.4074999988079071},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.38839998841285706},{"id":"https://openalex.org/C69744172","wikidata":"https://www.wikidata.org/wiki/Q860822","display_name":"Image fusion","level":3,"score":0.36340001225471497},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.35109999775886536},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.350600004196167},{"id":"https://openalex.org/C2777877512","wikidata":"https://www.wikidata.org/wiki/Q1116097","display_name":"Common ground","level":2,"score":0.3301999866962433},{"id":"https://openalex.org/C173414695","wikidata":"https://www.wikidata.org/wiki/Q5510276","display_name":"Fusion mechanism","level":4,"score":0.3188999891281128},{"id":"https://openalex.org/C33954974","wikidata":"https://www.wikidata.org/wiki/Q486494","display_name":"Sensor fusion","level":2,"score":0.27649998664855957},{"id":"https://openalex.org/C9395851","wikidata":"https://www.wikidata.org/wiki/Q177929","display_name":"Stack (abstract data type)","level":2,"score":0.2720000147819519},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.26649999618530273},{"id":"https://openalex.org/C2982962833","wikidata":"https://www.wikidata.org/wiki/Q17092450","display_name":"Information fusion","level":2,"score":0.2648000121116638},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.25029999017715454}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tmm.2025.3618553","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2025.3618553","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":52,"referenced_works":["https://openalex.org/W1773149199","https://openalex.org/W1933349210","https://openalex.org/W2185175083","https://openalex.org/W2194775991","https://openalex.org/W2489434015","https://openalex.org/W2946086442","https://openalex.org/W2950697717","https://openalex.org/W2962766617","https://openalex.org/W2963109634","https://openalex.org/W2964284374","https://openalex.org/W2964345792","https://openalex.org/W2984121207","https://openalex.org/W2987734933","https://openalex.org/W3089758964","https://openalex.org/W3094502228","https://openalex.org/W3110435696","https://openalex.org/W3112077297","https://openalex.org/W3159619744","https://openalex.org/W3163747765","https://openalex.org/W3192448690","https://openalex.org/W4205991051","https://openalex.org/W4210820868","https://openalex.org/W4214490042","https://openalex.org/W4309181071","https://openalex.org/W4312351586","https://openalex.org/W4313124918","https://openalex.org/W4313145013","https://openalex.org/W4320036901","https://openalex.org/W4382458695","https://openalex.org/W4384820618","https://openalex.org/W4386071547","https://openalex.org/W4386076636","https://openalex.org/W4386212341","https://openalex.org/W4387272106","https://openalex.org/W4387623721","https://openalex.org/W4387986753","https://openalex.org/W4389331817","https://openalex.org/W4389352357","https://openalex.org/W4389520746","https://openalex.org/W4389890864","https://openalex.org/W4392543666","https://openalex.org/W4392904512","https://openalex.org/W4393159185","https://openalex.org/W4394564126","https://openalex.org/W4395064954","https://openalex.org/W4400314890","https://openalex.org/W4402716271","https://openalex.org/W4402728160","https://openalex.org/W4402979572","https://openalex.org/W4404783315","https://openalex.org/W4408564121","https://openalex.org/W4416011362"],"related_works":[],"abstract_inverted_index":{"Visual":[0],"grounding":[1],"aims":[2],"to":[3,61,144],"ground":[4],"an":[5],"image":[6],"region":[7],"through":[8],"natural":[9],"language,":[10],"which":[11],"heavily":[12],"relies":[13],"on":[14,150],"cross-modal":[15,84,127,140],"alignment.":[16],"Most":[17],"existing":[18],"methods":[19],"transfer":[20],"visual/linguistic":[21],"knowledge":[22],"separately":[23],"by":[24,31,111,126],"fully":[25],"fine-tuning":[26],"uni-modal":[27],"pre-trained":[28],"models,":[29],"followed":[30],"a":[32,67,114],"simple":[33],"stack":[34],"of":[35,165],"visual-language":[36],"transformers":[37],"for":[38,88,96],"multimodal":[39,69,80,97,124],"fusion.":[40,98],"However,":[41],"these":[42,63],"approaches":[43],"not":[44],"only":[45],"limit":[46],"adequate":[47],"interaction":[48],"between":[49,104],"visual":[50,89],"and":[51,71,83,107,130,136,160],"linguistic":[52],"contexts,":[53],"but":[54],"also":[55],"incur":[56],"significant":[57],"computational":[58],"costs.":[59],"Therefore,":[60],"address":[62],"issues,":[64],"we":[65],"explore":[66],"step-wise":[68,79],"fusion":[70,116,125],"adaption":[72],"framework,":[73],"namely":[74],"SwimVG.":[75],"Specifically,":[76],"SwimVG":[77,156],"proposes":[78],"prompts":[81],"(Swip)":[82],"interactive":[85],"adapters":[86],"(CIA)":[87],"grounding,":[90],"replacing":[91],"the":[92,102,105,139],"cumbersome":[93],"transformer":[94],"stacks":[95],"Swip":[99,129],"can":[100],"improve":[101],"alignment":[103],"vision":[106],"language":[108],"representations":[109],"step":[110],"step,":[112],"in":[113,163],"token-level":[115],"manner.":[117],"In":[118],"addition,":[119],"weight-level":[120],"CIA":[121,131],"further":[122],"promotes":[123],"interaction.":[128],"are":[132],"both":[133],"parameter-efficient":[134],"paradigms,":[135],"they":[137],"fuse":[138],"features":[141],"from":[142],"shallow":[143],"deep":[145],"layers":[146],"gradually.":[147],"Experimental":[148],"results":[149],"four":[151],"widely-used":[152],"benchmarks":[153],"demonstrate":[154],"that":[155],"achieves":[157],"remarkable":[158],"abilities":[159],"considerable":[161],"benefits":[162],"terms":[164],"efficiency.":[166]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-24T00:00:00"}
