{"id":"https://openalex.org/W7131620229","doi":"https://doi.org/10.48550/arxiv.2602.22033","title":"RT-RMOT: A Dataset and Framework for RGB-Thermal Referring Multi-Object Tracking","display_name":"RT-RMOT: A Dataset and Framework for RGB-Thermal Referring Multi-Object Tracking","publication_year":2026,"publication_date":"2026-02-25","ids":{"openalex":"https://openalex.org/W7131620229","doi":"https://doi.org/10.48550/arxiv.2602.22033"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.22033","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5125160124","display_name":"Yanqiu Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yu, Yanqiu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126917583","display_name":"Zhifan Jin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jin, Zhifan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126918744","display_name":"Sijia Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Sijia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126856097","display_name":"Tongfei Chu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chu, Tongfei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126871106","display_name":"En Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, En","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102630555","display_name":"Liman Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Liman","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5125173629","display_name":"Wenbing Tao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tao, Wenbing","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5125160124"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.46709999442100525,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.46709999442100525,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10331","display_name":"Video Surveillance and Tracking Methods","score":0.22990000247955322,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11707","display_name":"Gaze Tracking and Assistive Technology","score":0.034699998795986176,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.7236999869346619},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.6455000042915344},{"id":"https://openalex.org/keywords/fuse","display_name":"Fuse (electrical)","score":0.4634999930858612},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.4487999975681305},{"id":"https://openalex.org/keywords/margin","display_name":"Margin (machine learning)","score":0.435699999332428},{"id":"https://openalex.org/keywords/completeness","display_name":"Completeness (order theory)","score":0.3935999870300293},{"id":"https://openalex.org/keywords/tracking","display_name":"Tracking (education)","score":0.33090001344680786},{"id":"https://openalex.org/keywords/eye-tracking","display_name":"Eye tracking","score":0.3228999972343445}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.814300000667572},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.7236999869346619},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6694999933242798},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.6455000042915344},{"id":"https://openalex.org/C141353440","wikidata":"https://www.wikidata.org/wiki/Q182221","display_name":"Fuse (electrical)","level":2,"score":0.4634999930858612},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.4487999975681305},{"id":"https://openalex.org/C774472","wikidata":"https://www.wikidata.org/wiki/Q6760393","display_name":"Margin (machine learning)","level":2,"score":0.435699999332428},{"id":"https://openalex.org/C17231256","wikidata":"https://www.wikidata.org/wiki/Q5156540","display_name":"Completeness (order theory)","level":2,"score":0.3935999870300293},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.38769999146461487},{"id":"https://openalex.org/C2775936607","wikidata":"https://www.wikidata.org/wiki/Q466845","display_name":"Tracking (education)","level":2,"score":0.33090001344680786},{"id":"https://openalex.org/C56461940","wikidata":"https://www.wikidata.org/wiki/Q970687","display_name":"Eye tracking","level":2,"score":0.3228999972343445},{"id":"https://openalex.org/C82990744","wikidata":"https://www.wikidata.org/wiki/Q166194","display_name":"RGB color model","level":2,"score":0.31529998779296875},{"id":"https://openalex.org/C154586513","wikidata":"https://www.wikidata.org/wiki/Q4420972","display_name":"Tracking system","level":3,"score":0.31529998779296875},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3025999963283539},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.29429998993873596},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.29190000891685486},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.27390000224113464},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.2694999873638153},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.2612000107765198},{"id":"https://openalex.org/C22367795","wikidata":"https://www.wikidata.org/wiki/Q7625208","display_name":"Structured prediction","level":2,"score":0.2605000138282776}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.22033","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.22033","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.22033","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.22033","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Referring":[0,71],"Multi-Object":[1,72],"Tracking":[2,73],"has":[3],"attracted":[4],"increasing":[5],"attention":[6],"due":[7],"to":[8,43,56,132,153,168],"its":[9],"human-friendly":[10],"interactive":[11],"characteristics,":[12],"yet":[13],"it":[14],"exhibits":[15],"limitations":[16],"in":[17],"low-visibility":[18],"conditions,":[19],"such":[20],"as":[21],"nighttime,":[22],"smoke,":[23],"and":[24,88,111,164,171,177],"other":[25],"challenging":[26],"scenarios.":[27],"To":[28,62,138],"overcome":[29],"this":[30],"limitation,":[31],"we":[32,67,94,123,145,159],"propose":[33,95],"a":[34,97,101,125,147],"new":[35],"RGB-Thermal":[36,76],"RMOT":[37],"task,":[38],"named":[39,78],"RT-RMOT,":[40,66],"which":[41],"aims":[42],"fuse":[44],"RGB":[45],"appearance":[46],"features":[47],"with":[48],"the":[49,53,69,115,135,175,185,189,192],"illumination":[50],"robustness":[51],"of":[52,179,191],"thermal":[54],"modality":[55],"enable":[57],"all-day":[58],"referring":[59],"multi-object":[60],"tracking.":[61],"promote":[63],"research":[64],"on":[65,184],"construct":[68],"first":[70],"dataset":[74,187],"under":[75],"modality,":[77],"RefRT.":[79],"It":[80],"contains":[81],"388":[82],"language":[83,104],"descriptions,":[84],"1,250":[85],"tracked":[86],"targets,":[87],"166,147":[89],"Language-RGB-Thermal":[90],"(L-RGB-T)":[91],"triplets.":[92],"Furthermore,":[93],"RTrack,":[96],"framework":[98,117],"built":[99],"upon":[100],"multimodal":[102],"large":[103],"model":[105],"(MLLM)":[106],"that":[107],"integrates":[108],"RGB,":[109],"thermal,":[110],"textual":[112],"features.":[113],"Since":[114],"initial":[116],"still":[118],"leaves":[119],"room":[120],"for":[121],"improvement,":[122],"introduce":[124,146],"Group":[126],"Sequence":[127],"Policy":[128],"Optimization":[129],"(GSPO)":[130],"strategy":[131,152],"further":[133],"exploit":[134],"model's":[136],"potential.":[137],"alleviate":[139],"training":[140],"instability":[141],"during":[142],"RL":[143],"fine-tuning,":[144],"Clipped":[148],"Advantage":[149],"Scaling":[150],"(CAS)":[151],"suppress":[154],"gradient":[155],"explosion.":[156],"In":[157],"addition,":[158],"design":[160],"Structured":[161],"Output":[162],"Reward":[163,167],"Comprehensive":[165],"Detection":[166],"balance":[169],"exploration":[170],"exploitation,":[172],"thereby":[173],"improving":[174],"completeness":[176],"accuracy":[178],"target":[180],"perception.":[181],"Extensive":[182],"experiments":[183],"RefRT":[186],"demonstrate":[188],"effectiveness":[190],"proposed":[193],"RTrack":[194],"framework.":[195]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-27T00:00:00"}
