{"id":"https://openalex.org/W4409985860","doi":"https://doi.org/10.1109/tmm.2025.3565984","title":"Multi-Modal Hybrid Interaction Vision-Language Tracking","display_name":"Multi-Modal Hybrid Interaction Vision-Language Tracking","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4409985860","doi":"https://doi.org/10.1109/tmm.2025.3565984"},"language":"en","primary_location":{"id":"doi:10.1109/tmm.2025.3565984","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2025.3565984","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5086427279","display_name":"Lei Lei","orcid":"https://orcid.org/0000-0001-8430-1246"},"institutions":[{"id":"https://openalex.org/I29739308","display_name":"Guangxi Normal University","ror":"https://ror.org/02frt9q65","country_code":"CN","type":"education","lineage":["https://openalex.org/I29739308"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Lei Lei","raw_affiliation_strings":["Key Lab of Education Blockchain and Intelligent Technology, Ministry of Education, Guangxi Normal University, Guilin, China"],"affiliations":[{"raw_affiliation_string":"Key Lab of Education Blockchain and Intelligent Technology, Ministry of Education, Guangxi Normal University, Guilin, China","institution_ids":["https://openalex.org/I29739308"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5037037839","display_name":"Xianxian Li","orcid":"https://orcid.org/0000-0002-7083-3847"},"institutions":[{"id":"https://openalex.org/I29739308","display_name":"Guangxi Normal University","ror":"https://ror.org/02frt9q65","country_code":"CN","type":"education","lineage":["https://openalex.org/I29739308"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xianxian Li","raw_affiliation_strings":["Key Lab of Education Blockchain and Intelligent Technology, Ministry of Education, Guangxi Normal University, Guilin, China"],"affiliations":[{"raw_affiliation_string":"Key Lab of Education Blockchain and Intelligent Technology, Ministry of Education, Guangxi Normal University, Guilin, China","institution_ids":["https://openalex.org/I29739308"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5086427279"],"corresponding_institution_ids":["https://openalex.org/I29739308"],"apc_list":null,"apc_paid":null,"fwci":4.0344,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.92661787,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":"27","issue":null,"first_page":"5857","last_page":"5865"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11707","display_name":"Gaze Tracking and Assistive Technology","score":0.9869999885559082,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11707","display_name":"Gaze Tracking and Assistive Technology","score":0.9869999885559082,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9672999978065491,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8467586040496826},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.547898530960083},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.536077618598938},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.5224663019180298},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.33621397614479065}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8467586040496826},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.547898530960083},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.536077618598938},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.5224663019180298},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.33621397614479065},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tmm.2025.3565984","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2025.3565984","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":38,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W2108598243","https://openalex.org/W2154889144","https://openalex.org/W2470394683","https://openalex.org/W2592463526","https://openalex.org/W2747053578","https://openalex.org/W2799058067","https://openalex.org/W2895906665","https://openalex.org/W2896457183","https://openalex.org/W2898200825","https://openalex.org/W2963109634","https://openalex.org/W3010072143","https://openalex.org/W3035511673","https://openalex.org/W3094502228","https://openalex.org/W3106542916","https://openalex.org/W3173871266","https://openalex.org/W3181069167","https://openalex.org/W3214586131","https://openalex.org/W4214759957","https://openalex.org/W4289535600","https://openalex.org/W4292828074","https://openalex.org/W4312751983","https://openalex.org/W4312805142","https://openalex.org/W4313307856","https://openalex.org/W4380520387","https://openalex.org/W4382119472","https://openalex.org/W4385569741","https://openalex.org/W4386066081","https://openalex.org/W4386075643","https://openalex.org/W4387272128","https://openalex.org/W4390871890","https://openalex.org/W4393159404","https://openalex.org/W4396505919","https://openalex.org/W4396598146","https://openalex.org/W4402753915","https://openalex.org/W4402754150","https://openalex.org/W4403758852","https://openalex.org/W4403791254"],"related_works":["https://openalex.org/W2772917594","https://openalex.org/W2036807459","https://openalex.org/W2058170566","https://openalex.org/W2755342338","https://openalex.org/W2166024367","https://openalex.org/W3116076068","https://openalex.org/W2229312674","https://openalex.org/W2951359407","https://openalex.org/W2079911747","https://openalex.org/W1969923398"],"abstract_inverted_index":{"Vision-language":[0],"tracking":[1,126],"is":[2,54,92,137,175,210],"a":[3,50,101,121,132,155,159,165],"crucial":[4],"branch":[5],"of":[6,78,88,144,154,182,225],"multi-modal":[7,58,90,122,133,145,149,203,215,227],"object":[8,15],"tracking,":[9],"aiming":[10],"to":[11,32,56,60,75,139,177,201,212,266],"jointly":[12],"locate":[13,61],"an":[14],"by":[16,95],"utilizing":[17],"visual":[18,30,39,46,70,197],"information":[19,81,87,143,181,224],"and":[20,29,38,47,71,164,198,253],"language":[21,28,36,48,72,83,199],"descriptions.":[22],"Typically,":[23],"existing":[24,267],"vision-language":[25,104,125,260,269],"trackers":[26,105],"employ":[27],"encoders":[31],"extract":[33,57],"features":[34,59,91,200,216],"from":[35,185],"descriptions":[37],"information,":[40,220],"respectively.":[41],"Based":[42],"on":[43,245],"these":[44],"extracted":[45],"features,":[49],"cross-modal":[51,97,194],"interaction":[52,98,124,135,151,157,162,167,173,191,195,208],"module":[53,174,192,209],"used":[55],"the":[62,67,76,85,89,96,103,112,141,170,183,189,206,214,222,226,230,239,258],"targets.":[63],"However,":[64],"they":[65],"ignore":[66],"differences":[68],"between":[69,196],"modalities.":[73],"Due":[74],"lack":[77],"pixel-level":[79],"position":[80,161,172,180,219],"in":[82,111,130,238],"descriptions,":[84],"positional":[86,142,223],"greatly":[93],"weakened":[94],"modules.":[99],"As":[100],"result,":[102],"cannot":[106],"effectively":[107,234],"capture":[108,178,235],"subtle":[109,236],"changes":[110,237],"target's":[113,240],"positions.":[114,241],"To":[115],"address":[116],"this":[117],"problem,":[118],"we":[119,255],"propose":[120],"hybrid":[123,134,150,166,207],"method":[127],"(named":[128],"MHITrack),":[129],"which":[131],"decoder":[136,152],"designed":[138],"enhance":[140],"features.":[146,187,204,228],"The":[147],"proposed":[148,231,259],"consists":[153],"visual-language":[156,190],"module,":[158,163],"multi-level":[160,171,186],"module.":[168],"Firstly,":[169],"utilized":[176],"fine-grained":[179],"target":[184,218],"Meanwhile,":[188],"performs":[193],"obtain":[202],"Furthermore,":[205],"employed":[211],"integrate":[213],"with":[217],"enhancing":[221],"Finally,":[229],"tracker":[232,261],"can":[233],"Through":[242],"extensive":[243],"experiments":[244],"four":[246],"benchmark":[247],"datasets,":[248],"namely":[249],"TNL2k,":[250],"LaSOT,":[251],"OTB-Lang,":[252],"LaSOText,":[254],"demonstrate":[256],"that":[257],"achieves":[262],"promising":[263],"performance":[264],"compared":[265],"state-of-the-art":[268],"trackers.":[270]},"counts_by_year":[{"year":2025,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
