{"id":"https://openalex.org/W4412991841","doi":"https://doi.org/10.1145/3757322","title":"Language-guided Visual Tracking: Comprehensive and Effective Multimodal Information Fusion","display_name":"Language-guided Visual Tracking: Comprehensive and Effective Multimodal Information Fusion","publication_year":2025,"publication_date":"2025-08-05","ids":{"openalex":"https://openalex.org/W4412991841","doi":"https://doi.org/10.1145/3757322"},"language":"en","primary_location":{"id":"doi:10.1145/3757322","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3757322","pdf_url":null,"source":{"id":"https://openalex.org/S19610489","display_name":"ACM Transactions on Multimedia Computing Communications and Applications","issn_l":"1551-6857","issn":["1551-6857","1551-6865"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Multimedia Computing, Communications, and Applications","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5047559536","display_name":"Jianbo Song","orcid":"https://orcid.org/0000-0001-5847-9630"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jianbo Song","raw_affiliation_strings":["Image Processing Center, School of Astronautics, Beihang University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-5847-9630","affiliations":[{"raw_affiliation_string":"Image Processing Center, School of Astronautics, Beihang University, Beijing, China","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100430263","display_name":"Hong Zhang","orcid":"https://orcid.org/0000-0002-1282-3755"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hong Zhang","raw_affiliation_strings":["Image Processing Center, School of Astronautics, Beihang University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-1282-3755","affiliations":[{"raw_affiliation_string":"Image Processing Center, School of Astronautics, Beihang University, Beijing, China","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100628033","display_name":"Yachun Feng","orcid":"https://orcid.org/0009-0003-5891-8674"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yachun Feng","raw_affiliation_strings":["School of Mechanical Engineering and Automation, Beihang University, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0003-5891-8674","affiliations":[{"raw_affiliation_string":"School of Mechanical Engineering and Automation, Beihang University, Beijing, China","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101709858","display_name":"Hanyang Liu","orcid":"https://orcid.org/0009-0009-8956-1245"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hanyang Liu","raw_affiliation_strings":["Image Processing Center, School of Astronautics, Beihang University, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0009-8956-1245","affiliations":[{"raw_affiliation_string":"Image Processing Center, School of Astronautics, Beihang University, Beijing, China","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100629264","display_name":"Y. F. Yang","orcid":"https://orcid.org/0000-0003-4237-5874"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yifan Yang","raw_affiliation_strings":["Image Processing Center, School of Astronautics, Beihang University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0003-4237-5874","affiliations":[{"raw_affiliation_string":"Image Processing Center, School of Astronautics, Beihang University, Beijing, China","institution_ids":["https://openalex.org/I82880672"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5047559536"],"corresponding_institution_ids":["https://openalex.org/I82880672"],"apc_list":null,"apc_paid":null,"fwci":1.0044,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.78748168,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":98},"biblio":{"volume":"21","issue":"10","first_page":"1","last_page":"23"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10331","display_name":"Video Surveillance and Tracking Methods","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10331","display_name":"Video Surveillance and Tracking Methods","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12597","display_name":"Fire Detection and Safety Systems","score":0.9661999940872192,"subfield":{"id":"https://openalex.org/subfields/2213","display_name":"Safety, Risk, Reliability and Quality"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.9606999754905701,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8988628387451172},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.5478385090827942},{"id":"https://openalex.org/keywords/multimodal-interaction","display_name":"Multimodal interaction","score":0.5440837144851685},{"id":"https://openalex.org/keywords/information-fusion","display_name":"Information fusion","score":0.512122392654419},{"id":"https://openalex.org/keywords/eye-tracking","display_name":"Eye tracking","score":0.48337239027023315},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4187925457954407},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.37485843896865845},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3588332533836365}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8988628387451172},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.5478385090827942},{"id":"https://openalex.org/C135641252","wikidata":"https://www.wikidata.org/wiki/Q738567","display_name":"Multimodal interaction","level":2,"score":0.5440837144851685},{"id":"https://openalex.org/C2982962833","wikidata":"https://www.wikidata.org/wiki/Q17092450","display_name":"Information fusion","level":2,"score":0.512122392654419},{"id":"https://openalex.org/C56461940","wikidata":"https://www.wikidata.org/wiki/Q970687","display_name":"Eye tracking","level":2,"score":0.48337239027023315},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4187925457954407},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.37485843896865845},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3588332533836365}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3757322","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3757322","pdf_url":null,"source":{"id":"https://openalex.org/S19610489","display_name":"ACM Transactions on Multimedia Computing Communications and Applications","issn_l":"1551-6857","issn":["1551-6857","1551-6865"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Multimedia Computing, Communications, and Applications","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G711644270","display_name":null,"funder_award_id":"62002005","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":49,"referenced_works":["https://openalex.org/W2158592639","https://openalex.org/W2470394683","https://openalex.org/W2558687840","https://openalex.org/W2582998992","https://openalex.org/W2747053578","https://openalex.org/W2794744029","https://openalex.org/W2799058067","https://openalex.org/W2891033863","https://openalex.org/W2963109634","https://openalex.org/W2998434318","https://openalex.org/W3021642200","https://openalex.org/W3035466700","https://openalex.org/W3035524453","https://openalex.org/W3090155371","https://openalex.org/W3096609285","https://openalex.org/W3167536469","https://openalex.org/W3173871266","https://openalex.org/W3179950005","https://openalex.org/W3181069167","https://openalex.org/W3184735396","https://openalex.org/W3204554907","https://openalex.org/W3214586131","https://openalex.org/W4214759957","https://openalex.org/W4214810297","https://openalex.org/W4214872590","https://openalex.org/W4226024706","https://openalex.org/W4283825622","https://openalex.org/W4286901804","https://openalex.org/W4293067233","https://openalex.org/W4312323989","https://openalex.org/W4312532041","https://openalex.org/W4312735552","https://openalex.org/W4312751983","https://openalex.org/W4312805142","https://openalex.org/W4312877428","https://openalex.org/W4313307856","https://openalex.org/W4385245566","https://openalex.org/W4385569741","https://openalex.org/W4386065544","https://openalex.org/W4386065598","https://openalex.org/W4386066081","https://openalex.org/W4386066459","https://openalex.org/W4386075643","https://openalex.org/W4386076180","https://openalex.org/W4390872215","https://openalex.org/W4390874039","https://openalex.org/W4393178550","https://openalex.org/W4402727764","https://openalex.org/W6839144149"],"related_works":["https://openalex.org/W2035104213","https://openalex.org/W2240967841","https://openalex.org/W2791686160","https://openalex.org/W1513257435","https://openalex.org/W2289840258","https://openalex.org/W2503352526","https://openalex.org/W2968405290","https://openalex.org/W2960696728","https://openalex.org/W2114040463","https://openalex.org/W2053633997"],"abstract_inverted_index":{"Current":[0],"vision-language":[1,132],"trackers":[2],"often":[3],"struggle":[4],"to":[5,13,30,54,63,130],"fuse":[6],"multimodal":[7,17,57,88,121],"information":[8,74],"comprehensively":[9],"and":[10,35,40,59,77,102,154],"effectively,":[11],"leading":[12],"suboptimal":[14],"performance":[15],"in":[16,134],"tasks.":[18],"This":[19],"study":[20],"introduces":[21],"LGTrack,":[22],"a":[23,32,87,116,124],"novel":[24],"language-guided":[25],"visual":[26],"tracking":[27],"framework":[28],"designed":[29],"achieve":[31,55,115],"more":[33,117],"comprehensive":[34,118],"efficient":[36,107],"fusion":[37,119,141],"of":[38,80,109,120],"vision":[39,81,101],"language":[41,103],"information.":[42,122],"In":[43,83],"the":[44,84,106,110,135,140],"encoding":[45],"stage,":[46,86],"an":[47],"Enhanced":[48],"Multimodal":[49,68],"Interaction":[50],"Module":[51],"is":[52,61,94,128],"proposed":[53],"full":[56],"fusion,":[58],"it":[60],"used":[62],"construct":[64],"Early":[65],"Language":[66],"Multilevel-guided":[67],"Encoding,":[69],"which":[70],"leverages":[71],"deep":[72],"semantic":[73,136],"for":[75],"early":[76],"multilevel":[78],"guidance":[79],"encoding.":[82],"decoding":[85,89,111],"based":[90],"on":[91,145],"Joint":[92],"Query":[93],"proposed,":[95],"utilizing":[96],"global":[97],"features":[98,133],"from":[99],"both":[100],"modalities,":[104],"guiding":[105],"operation":[108],"layers.":[112],"These":[113],"innovations":[114],"Additionally,":[123],"contrastive":[125],"learning":[126],"strategy":[127],"introduced":[129],"align":[131],"space,":[137],"further":[138],"enhancing":[139],"effectiveness.":[142],"Extensive":[143],"experiments":[144],"multiple":[146],"benchmarks":[147],"such":[148],"as":[149],"LaSOT,":[150],"\\(\\rm{LaSOT_{ext}}\\)":[151],",":[152],"TNL2K,":[153],"OTB99-Lang":[155],"demonstrate":[156],"that":[157],"our":[158],"approach":[159],"outperforms":[160],"existing":[161],"state-of-the-art":[162],"trackers.":[163]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2025-10-10T00:00:00"}
