{"id":"https://openalex.org/W4392902794","doi":"https://doi.org/10.1109/icassp48485.2024.10446122","title":"Textual Tokens Classification for Multi-Modal Alignment in Vision-Language Tracking","display_name":"Textual Tokens Classification for Multi-Modal Alignment in Vision-Language Tracking","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392902794","doi":"https://doi.org/10.1109/icassp48485.2024.10446122"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10446122","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446122","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5022365489","display_name":"Zhongjie Mao","orcid":"https://orcid.org/0000-0002-5738-2340"},"institutions":[{"id":"https://openalex.org/I37461747","display_name":"Wuhan University","ror":"https://ror.org/033vjfk17","country_code":"CN","type":"education","lineage":["https://openalex.org/I37461747"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhongjie Mao","raw_affiliation_strings":["School of Computer Science, Wuhan University"],"affiliations":[{"raw_affiliation_string":"School of Computer Science, Wuhan University","institution_ids":["https://openalex.org/I37461747"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100410460","display_name":"Yucheng Wang","orcid":"https://orcid.org/0000-0002-6185-1680"},"institutions":[{"id":"https://openalex.org/I37461747","display_name":"Wuhan University","ror":"https://ror.org/033vjfk17","country_code":"CN","type":"education","lineage":["https://openalex.org/I37461747"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yucheng Wang","raw_affiliation_strings":["School of Computer Science, Wuhan University"],"affiliations":[{"raw_affiliation_string":"School of Computer Science, Wuhan University","institution_ids":["https://openalex.org/I37461747"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101723949","display_name":"Xi Chen","orcid":"https://orcid.org/0000-0002-0735-9919"},"institutions":[{"id":"https://openalex.org/I37461747","display_name":"Wuhan University","ror":"https://ror.org/033vjfk17","country_code":"CN","type":"education","lineage":["https://openalex.org/I37461747"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xi Chen","raw_affiliation_strings":["School of Computer Science, Wuhan University"],"affiliations":[{"raw_affiliation_string":"School of Computer Science, Wuhan University","institution_ids":["https://openalex.org/I37461747"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101705676","display_name":"Jia Yan","orcid":"https://orcid.org/0000-0001-5402-4698"},"institutions":[{"id":"https://openalex.org/I37461747","display_name":"Wuhan University","ror":"https://ror.org/033vjfk17","country_code":"CN","type":"education","lineage":["https://openalex.org/I37461747"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jia Yan","raw_affiliation_strings":["School of Electronic Information, Wuhan University,Department of Electrical Engineering","Department of Electrical Engineering, School of Electronic Information, Wuhan University"],"affiliations":[{"raw_affiliation_string":"School of Electronic Information, Wuhan University,Department of Electrical Engineering","institution_ids":["https://openalex.org/I37461747"]},{"raw_affiliation_string":"Department of Electrical Engineering, School of Electronic Information, Wuhan University","institution_ids":["https://openalex.org/I37461747"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5022365489"],"corresponding_institution_ids":["https://openalex.org/I37461747"],"apc_list":null,"apc_paid":null,"fwci":0.4767,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.59692184,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"8025","last_page":"8029"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9976000189781189,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9975000023841858,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8463718891143799},{"id":"https://openalex.org/keywords/automatic-summarization","display_name":"Automatic summarization","score":0.7262887358665466},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.6707131862640381},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6193181872367859},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.6137229800224304},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.5607908368110657},{"id":"https://openalex.org/keywords/bittorrent-tracker","display_name":"BitTorrent tracker","score":0.558035135269165},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.5435959100723267},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.5334081649780273},{"id":"https://openalex.org/keywords/ambiguity","display_name":"Ambiguity","score":0.4456261396408081},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.4222847819328308},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.41310322284698486},{"id":"https://openalex.org/keywords/tracking","display_name":"Tracking (education)","score":0.4108661413192749},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.37051117420196533},{"id":"https://openalex.org/keywords/eye-tracking","display_name":"Eye tracking","score":0.2327500879764557}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8463718891143799},{"id":"https://openalex.org/C170858558","wikidata":"https://www.wikidata.org/wiki/Q1394144","display_name":"Automatic summarization","level":2,"score":0.7262887358665466},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.6707131862640381},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6193181872367859},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.6137229800224304},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.5607908368110657},{"id":"https://openalex.org/C57501372","wikidata":"https://www.wikidata.org/wiki/Q2021268","display_name":"BitTorrent tracker","level":3,"score":0.558035135269165},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.5435959100723267},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.5334081649780273},{"id":"https://openalex.org/C2780522230","wikidata":"https://www.wikidata.org/wiki/Q1140419","display_name":"Ambiguity","level":2,"score":0.4456261396408081},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.4222847819328308},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.41310322284698486},{"id":"https://openalex.org/C2775936607","wikidata":"https://www.wikidata.org/wiki/Q466845","display_name":"Tracking (education)","level":2,"score":0.4108661413192749},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.37051117420196533},{"id":"https://openalex.org/C56461940","wikidata":"https://www.wikidata.org/wiki/Q970687","display_name":"Eye tracking","level":2,"score":0.2327500879764557},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C19417346","wikidata":"https://www.wikidata.org/wiki/Q7922","display_name":"Pedagogy","level":1,"score":0.0},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10446122","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446122","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.5799999833106995}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W2747053578","https://openalex.org/W2891033863","https://openalex.org/W2896457183","https://openalex.org/W2966759264","https://openalex.org/W2997896013","https://openalex.org/W3035453691","https://openalex.org/W3035672751","https://openalex.org/W3167536469","https://openalex.org/W3173871266","https://openalex.org/W3181069167","https://openalex.org/W3204554907","https://openalex.org/W3214586131","https://openalex.org/W4214759957","https://openalex.org/W4226077544","https://openalex.org/W4283825622","https://openalex.org/W4292828074","https://openalex.org/W4310922994","https://openalex.org/W4312751983","https://openalex.org/W4313156423","https://openalex.org/W4321766229","https://openalex.org/W4386075643","https://openalex.org/W6805147364","https://openalex.org/W6839144149","https://openalex.org/W6847178612"],"related_works":["https://openalex.org/W2366403280","https://openalex.org/W1495108544","https://openalex.org/W2091301346","https://openalex.org/W3148229873","https://openalex.org/W4389760904","https://openalex.org/W2150160875","https://openalex.org/W4242223894","https://openalex.org/W4306886878","https://openalex.org/W2973759123","https://openalex.org/W1517524280"],"abstract_inverted_index":{"Most":[0],"vision-language":[1],"(VL)":[2],"trackers":[3],"rely":[4],"on":[5,76,117],"coarse-grained":[6],"information":[7,16],"from":[8],"sentences":[9],"to":[10,28,55,65,90],"achieve":[11],"multi-modal":[12,39,63,79,111],"alignment.":[13],"However,":[14],"this":[15,57],"is":[17],"insufficient":[18],"for":[19],"accurately":[20,91],"describing":[21],"the":[22,29,62,82,87,93,98,110,121],"target":[23,94],"in":[24,81],"each":[25],"frame":[26],"due":[27],"inherent":[30],"ambiguity,":[31],"summarization,":[32],"and":[33,72,95,113],"invariance":[34],"of":[35,123],"sentences,":[36],"thereby":[37],"making":[38],"alignment":[40,80],"challenging.":[41],"This":[42,85],"paper":[43],"introduces":[44],"TTCTrack,":[45],"a":[46,104],"novel":[47],"VL":[48],"tracker":[49],"that":[50,107],"employs":[51],"textual":[52,67,88],"token":[53],"classification":[54],"address":[56],"challenge.":[58],"Specifically,":[59],"we":[60,102],"exploit":[61],"cross-relations":[64],"classify":[66],"tokens":[68,89],"into":[69],"various":[70],"types":[71],"employ":[73],"diverse":[74],"operations":[75],"them,":[77],"enabling":[78],"tracking":[83,126],"process.":[84],"enables":[86],"describe":[92],"dynamically":[96],"reflect":[97],"scene":[99],"changes.":[100],"Moreover,":[101],"introduce":[103],"dual-encoder":[105],"structure":[106],"effectively":[108],"handles":[109],"input":[112],"fusion.":[114],"Extensive":[115],"experiments":[116],"four":[118],"datasets":[119],"demonstrate":[120],"effectiveness":[122],"our":[124],"proposed":[125],"method.":[127]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":1}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2025-10-10T00:00:00"}
