{"id":"https://openalex.org/W4414603387","doi":"https://doi.org/10.1109/tcsvt.2025.3615851","title":"Temporal-Enhanced Multimodal Transformer for Referring Multi-Object Tracking and Segmentation","display_name":"Temporal-Enhanced Multimodal Transformer for Referring Multi-Object Tracking and Segmentation","publication_year":2025,"publication_date":"2025-09-29","ids":{"openalex":"https://openalex.org/W4414603387","doi":"https://doi.org/10.1109/tcsvt.2025.3615851"},"language":"en","primary_location":{"id":"doi:10.1109/tcsvt.2025.3615851","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcsvt.2025.3615851","pdf_url":null,"source":{"id":"https://openalex.org/S115173108","display_name":"IEEE Transactions on Circuits and Systems for Video Technology","issn_l":"1051-8215","issn":["1051-8215","1558-2205"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Circuits and Systems for Video Technology","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5054107548","display_name":"Changcheng Xiao","orcid":"https://orcid.org/0000-0002-4707-343X"},"institutions":[{"id":"https://openalex.org/I154893126","display_name":"Guizhou Normal University","ror":"https://ror.org/02x1pa065","country_code":"CN","type":"education","lineage":["https://openalex.org/I154893126"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Changcheng Xiao","raw_affiliation_strings":["School of Big Data and Computer Science, Guizhou Normal University, Guiyang, China"],"raw_orcid":"https://orcid.org/0000-0002-4707-343X","affiliations":[{"raw_affiliation_string":"School of Big Data and Computer Science, Guizhou Normal University, Guiyang, China","institution_ids":["https://openalex.org/I154893126"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101467863","display_name":"Qiong Cao","orcid":"https://orcid.org/0000-0001-8750-3505"},"institutions":[{"id":"https://openalex.org/I4210103986","display_name":"Jingdong (China)","ror":"https://ror.org/01dkjkq64","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210103986"]},{"id":"https://openalex.org/I4210156809","display_name":"Future Academy","ror":"https://ror.org/0570trv98","country_code":"EG","type":"education","lineage":["https://openalex.org/I4210156809"]}],"countries":["CN","EG"],"is_corresponding":false,"raw_author_name":"Qiong Cao","raw_affiliation_strings":["JD Future Academy, Beijing, China","JD Technology, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-8750-3505","affiliations":[{"raw_affiliation_string":"JD Future Academy, Beijing, China","institution_ids":["https://openalex.org/I4210156809"]},{"raw_affiliation_string":"JD Technology, Beijing, China","institution_ids":["https://openalex.org/I4210103986"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114911198","display_name":"Yujie Zhong","orcid":"https://orcid.org/0009-0007-9127-3387"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yujie Zhong","raw_affiliation_strings":["Meituan Inc., Beijing, China","Meituan Inc, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0007-9127-3387","affiliations":[{"raw_affiliation_string":"Meituan Inc., Beijing, China","institution_ids":[]},{"raw_affiliation_string":"Meituan Inc, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100368865","display_name":"Xiang Zhang","orcid":"https://orcid.org/0000-0002-5201-3802"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiang Zhang","raw_affiliation_strings":["College of Computer Science and Technology, National University of Defense Technology, Changsha, China"],"raw_orcid":"https://orcid.org/0000-0002-5201-3802","affiliations":[{"raw_affiliation_string":"College of Computer Science and Technology, National University of Defense Technology, Changsha, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100453466","display_name":"Tao Wang","orcid":"https://orcid.org/0000-0002-0865-0062"},"institutions":[{"id":"https://openalex.org/I4210103986","display_name":"Jingdong (China)","ror":"https://ror.org/01dkjkq64","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210103986"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tao Wang","raw_affiliation_strings":["JD Technology, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"JD Technology, Beijing, China","institution_ids":["https://openalex.org/I4210103986"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027243864","display_name":"Canqun Yang","orcid":"https://orcid.org/0009-0008-4757-2475"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Canqun Yang","raw_affiliation_strings":["College of Computer Science and Technology, National University of Defense Technology, Changsha, China"],"raw_orcid":"https://orcid.org/0009-0008-4757-2475","affiliations":[{"raw_affiliation_string":"College of Computer Science and Technology, National University of Defense Technology, Changsha, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5060251766","display_name":"Long Lan","orcid":"https://orcid.org/0000-0002-4238-8985"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Long Lan","raw_affiliation_strings":["College of Computer Science and Technology, National University of Defense Technology, Changsha, China"],"raw_orcid":"https://orcid.org/0000-0002-4238-8985","affiliations":[{"raw_affiliation_string":"College of Computer Science and Technology, National University of Defense Technology, Changsha, China","institution_ids":["https://openalex.org/I170215575"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5054107548"],"corresponding_institution_ids":["https://openalex.org/I154893126"],"apc_list":null,"apc_paid":null,"fwci":2.1998,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.89438793,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":98},"biblio":{"volume":"36","issue":"3","first_page":"3652","last_page":"3664"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13382","display_name":"Robotics and Automated Systems","score":0.8604999780654907,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13382","display_name":"Robotics and Automated Systems","score":0.8604999780654907,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10711","display_name":"Target Tracking and Data Fusion in Sensor Networks","score":0.8359000086784363,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.7738999724388123,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.6945000290870667},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.5964000225067139},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.5073000192642212},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.4763000011444092},{"id":"https://openalex.org/keywords/video-tracking","display_name":"Video tracking","score":0.44510000944137573},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.429500013589859},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3917999863624573},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.3894999921321869},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.3765000104904175}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8330000042915344},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.6945000290870667},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6527000069618225},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.5964000225067139},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.5073000192642212},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4869000017642975},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.4763000011444092},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.44510000944137573},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.429500013589859},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3917999863624573},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.3894999921321869},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.3765000104904175},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.37470000982284546},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.35830000042915344},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.3547999858856201},{"id":"https://openalex.org/C33954974","wikidata":"https://www.wikidata.org/wiki/Q486494","display_name":"Sensor fusion","level":2,"score":0.33320000767707825},{"id":"https://openalex.org/C2775936607","wikidata":"https://www.wikidata.org/wiki/Q466845","display_name":"Tracking (education)","level":2,"score":0.31869998574256897},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.3179999887943268},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.30570000410079956},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.2939999997615814},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2930000126361847},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.2863999903202057},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.2858999967575073},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2770000100135803},{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.27230000495910645},{"id":"https://openalex.org/C173414695","wikidata":"https://www.wikidata.org/wiki/Q5510276","display_name":"Fusion mechanism","level":4,"score":0.25699999928474426},{"id":"https://openalex.org/C90559484","wikidata":"https://www.wikidata.org/wiki/Q778379","display_name":"Expression (computer science)","level":2,"score":0.25679999589920044}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tcsvt.2025.3615851","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcsvt.2025.3615851","pdf_url":null,"source":{"id":"https://openalex.org/S115173108","display_name":"IEEE Transactions on Circuits and Systems for Video Technology","issn_l":"1051-8215","issn":["1051-8215","1558-2205"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Circuits and Systems for Video Technology","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G2466138076","display_name":null,"funder_award_id":"62376282","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7784321418","display_name":null,"funder_award_id":"62262005","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":59,"referenced_works":["https://openalex.org/W2150066425","https://openalex.org/W2185175083","https://openalex.org/W2194775991","https://openalex.org/W2251512949","https://openalex.org/W2252355370","https://openalex.org/W2489434015","https://openalex.org/W2603203130","https://openalex.org/W2904910963","https://openalex.org/W2951390634","https://openalex.org/W2953920664","https://openalex.org/W2962703144","https://openalex.org/W2962914239","https://openalex.org/W2962942822","https://openalex.org/W2963354481","https://openalex.org/W2963521717","https://openalex.org/W2964284374","https://openalex.org/W2964345792","https://openalex.org/W2965373594","https://openalex.org/W2970231061","https://openalex.org/W2970603850","https://openalex.org/W2982723417","https://openalex.org/W2984121207","https://openalex.org/W2991023920","https://openalex.org/W3035564946","https://openalex.org/W3039885434","https://openalex.org/W3084173793","https://openalex.org/W3086436251","https://openalex.org/W3094000868","https://openalex.org/W3095753995","https://openalex.org/W3096609285","https://openalex.org/W3104844437","https://openalex.org/W3106763294","https://openalex.org/W3107094551","https://openalex.org/W3112077297","https://openalex.org/W3119686997","https://openalex.org/W3127242989","https://openalex.org/W3159619744","https://openalex.org/W3199093552","https://openalex.org/W3199576129","https://openalex.org/W4285130999","https://openalex.org/W4286904999","https://openalex.org/W4312473433","https://openalex.org/W4312815172","https://openalex.org/W4312956471","https://openalex.org/W4313054169","https://openalex.org/W4313072323","https://openalex.org/W4313117614","https://openalex.org/W4322730933","https://openalex.org/W4361983876","https://openalex.org/W4382240009","https://openalex.org/W4385245566","https://openalex.org/W4385575022","https://openalex.org/W4386076180","https://openalex.org/W4386083103","https://openalex.org/W4386108423","https://openalex.org/W4392543666","https://openalex.org/W4392904092","https://openalex.org/W4400721625","https://openalex.org/W4402726924"],"related_works":[],"abstract_inverted_index":{"Referring":[0,154],"multi-object":[1,209],"tracking":[2,210],"(RMOT)":[3],"is":[4],"an":[5,13],"emerging":[6],"cross-modal":[7,96],"task":[8,33,152],"that":[9,129],"aims":[10],"to":[11,84,111,139,189],"locate":[12],"arbitrary":[14],"number":[15],"of":[16,37,47,89,118,135,143,171],"target":[17,48],"objects":[18,138],"and":[19,39,59,81,157,160,177,211],"maintain":[20],"their":[21,144],"identities":[22],"referred":[23],"by":[24],"a":[25,29,68,125,150,162,185],"language":[26],"expression":[27,179],"in":[28,194],"video.":[30],"This":[31],"intricate":[32],"involves":[34],"the":[35,44,51,87,100,104,119,136,141,190,207,212],"reasoning":[36],"linguistic":[38],"visual":[40],"modalities,":[41],"along":[42],"with":[43,174],"temporal":[45,132],"association":[46],"objects.":[49,121],"However,":[50],"seminal":[52],"work":[53],"relies":[54],"on":[55,205],"loose":[56],"feature":[57,76],"fusion":[58,77,97],"neglects":[60],"long-term":[61],"information.":[62],"In":[63,103,146],"this":[64],"study,":[65],"we":[66,93,107,123,148],"introduce":[67,124,149],"compact":[69],"Transformer-based":[70],"method,":[71],"termed":[72],"TenRMOT.":[73],"We":[74],"conduct":[75],"at":[78],"both":[79,206],"encoding":[80,101],"decoding":[82,105],"stages":[83],"fully":[85],"exploit":[86],"advantages":[88],"Transformer":[90],"architecture.":[91],"Specifically,":[92],"incrementally":[94],"perform":[95],"layer-by-layer":[98],"during":[99],"phase.":[102],"phase,":[106],"utilize":[108],"language-guided":[109],"queries":[110],"probe":[112],"memory":[113],"features":[114],"for":[115],"accurate":[116],"prediction":[117],"desired":[120],"Moreover,":[122],"query":[126],"update":[127],"module":[128],"explicitly":[130],"leverages":[131],"prior":[133],"information":[134],"tracked":[137],"enhance":[140],"consistency":[142],"trajectories.":[145],"addition,":[147],"novel":[151],"called":[153],"Multi-Object":[155],"Tracking":[156],"Segmentation":[158],"(RMOTS)":[159],"construct":[161],"new":[163],"dataset":[164,169],"named":[165],"Ref-KITTI":[166],"Segmentation.":[167],"Our":[168],"consists":[170],"18":[172],"videos":[173],"818":[175],"expressions,":[176],"each":[178],"averages":[180],"10.7":[181],"masks,":[182],"which":[183],"poses":[184],"greater":[186],"challenge":[187],"compared":[188],"typical":[191],"single":[192],"mask":[193],"most":[195],"existing":[196],"referring":[197,208],"video":[198],"segmentation":[199,213],"datasets.":[200],"TenRMOT":[201],"demonstrates":[202],"superior":[203],"performance":[204],"tasks.":[214]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":1}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
