{"id":"https://openalex.org/W7160955481","doi":"https://doi.org/10.48550/arxiv.2605.09858","title":"Clip-level Uncertainty and Temporal-aware Active Learning for End-to-End Multi-Object Tracking","display_name":"Clip-level Uncertainty and Temporal-aware Active Learning for End-to-End Multi-Object Tracking","publication_year":2026,"publication_date":"2026-05-11","ids":{"openalex":"https://openalex.org/W7160955481","doi":"https://doi.org/10.48550/arxiv.2605.09858"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.09858","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.09858","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.09858","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5021392103","display_name":"Riku Inoue","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Inoue, Riku","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023557292","display_name":"Shogo Sato","orcid":"https://orcid.org/0000-0002-2874-7072"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sato, Shogo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007740741","display_name":"Kazuhiko Murasaki","orcid":"https://orcid.org/0000-0001-7697-9575"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Murasaki, Kazuhiko","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079194190","display_name":"Tomoyasu Shimada","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shimada, Tomoyasu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058947969","display_name":"Toshihiko Nishimura","orcid":"https://orcid.org/0000-0002-4759-2483"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nishimura, Toshihiko","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5060947560","display_name":"Ryuichi Tanida","orcid":"https://orcid.org/0000-0002-5379-3150"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tanida, Ryuichi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10331","display_name":"Video Surveillance and Tracking Methods","score":0.9171000123023987,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10331","display_name":"Video Surveillance and Tracking Methods","score":0.9171000123023987,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.010200000368058681,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.006300000008195639,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6657999753952026},{"id":"https://openalex.org/keywords/redundancy","display_name":"Redundancy (engineering)","score":0.654699981212616},{"id":"https://openalex.org/keywords/active-learning","display_name":"Active learning (machine learning)","score":0.6200000047683716},{"id":"https://openalex.org/keywords/bittorrent-tracker","display_name":"BitTorrent tracker","score":0.5291000008583069},{"id":"https://openalex.org/keywords/tracking","display_name":"Tracking (education)","score":0.43779999017715454},{"id":"https://openalex.org/keywords/annotation","display_name":"Annotation","score":0.41780000925064087},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.41339999437332153},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.39070001244544983},{"id":"https://openalex.org/keywords/contrast","display_name":"Contrast (vision)","score":0.38679999113082886}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7276999950408936},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6657999753952026},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.654699981212616},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6474999785423279},{"id":"https://openalex.org/C77967617","wikidata":"https://www.wikidata.org/wiki/Q4677561","display_name":"Active learning (machine learning)","level":2,"score":0.6200000047683716},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.574400007724762},{"id":"https://openalex.org/C57501372","wikidata":"https://www.wikidata.org/wiki/Q2021268","display_name":"BitTorrent tracker","level":3,"score":0.5291000008583069},{"id":"https://openalex.org/C2775936607","wikidata":"https://www.wikidata.org/wiki/Q466845","display_name":"Tracking (education)","level":2,"score":0.43779999017715454},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.41780000925064087},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.41339999437332153},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.39070001244544983},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.38679999113082886},{"id":"https://openalex.org/C46686674","wikidata":"https://www.wikidata.org/wiki/Q466303","display_name":"Boosting (machine learning)","level":2,"score":0.36309999227523804},{"id":"https://openalex.org/C193611912","wikidata":"https://www.wikidata.org/wiki/Q4677596","display_name":"Active vision","level":2,"score":0.3598000109195709},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.35339999198913574},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.3208000063896179},{"id":"https://openalex.org/C83248878","wikidata":"https://www.wikidata.org/wiki/Q344000","display_name":"Active appearance model","level":3,"score":0.3147999942302704},{"id":"https://openalex.org/C100776233","wikidata":"https://www.wikidata.org/wiki/Q2532492","display_name":"Bridge (graph theory)","level":2,"score":0.31040000915527344},{"id":"https://openalex.org/C121687571","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Activity recognition","level":2,"score":0.29989999532699585},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.29809999465942383},{"id":"https://openalex.org/C58973888","wikidata":"https://www.wikidata.org/wiki/Q1041418","display_name":"Semi-supervised learning","level":2,"score":0.2881999909877777},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.2773999869823456},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.26510000228881836},{"id":"https://openalex.org/C22367795","wikidata":"https://www.wikidata.org/wiki/Q7625208","display_name":"Structured prediction","level":2,"score":0.26179999113082886}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.09858","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.09858","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.09858","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.09858","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Multi-Object":[0],"Tracking":[1],"(MOT)":[2],"in":[3,47],"dynamic":[4],"environments":[5],"relies":[6],"on":[7,85,167],"robust":[8],"temporal":[9,28,129],"reasoning":[10],"to":[11,56,108,122,131,162],"maintain":[12],"consistent":[13],"object":[14],"identities":[15],"over":[16],"time.":[17],"Transformer-based":[18],"end-to-end":[19,78],"MOT":[20,65],"models":[21],"achieve":[22],"strong":[23,45],"performance":[24,145,160],"by":[25],"explicitly":[26],"modeling":[27],"dependencies,":[29],"yet":[30],"training":[31,83,176],"them":[32],"requires":[33],"extensive":[34],"bounding-box":[35],"and":[36,44,82,97,101,135,155],"identity":[37],"annotations.":[38],"Given":[39],"the":[40,69,149,174],"high":[41],"labeling":[42],"cost":[43],"redundancy":[46],"videos,":[48],"Active":[49,103],"Learning":[50,104],"(AL)":[51],"is":[52,73],"an":[53,133],"effective":[54],"approach":[55],"improve":[57],"annotation":[58],"efficiency.":[59],"However,":[60],"existing":[61],"AL":[62],"methods":[63],"for":[64,165],"primarily":[66],"operate":[67],"at":[68,148],"frame":[70],"level,":[71],"which":[72],"structurally":[74],"misaligned":[75],"with":[76],"modern":[77],"trackers":[79],"whose":[80],"inference":[81],"rely":[84],"multi-frame":[86,120],"clips.":[87],"To":[88],"bridge":[89],"this":[90],"gap,":[91],"we":[92],"formulate":[93],"clip-level":[94],"active":[95],"learning":[96],"propose":[98],"Clip-level":[99],"Uncertainty":[100],"Temporal-aware":[102],"(CUTAL).":[105],"In":[106],"contrast":[107],"frame-based":[109],"approaches,":[110],"CUTAL":[111,141,158],"scores":[112],"each":[113],"clip":[114],"using":[115,170],"uncertainty":[116],"metrics":[117],"derived":[118],"from":[119],"predictions":[121],"capture":[123],"inter-frame":[124],"correspondence":[125],"ambiguities,":[126],"while":[127],"enforcing":[128],"diversity":[130],"select":[132],"informative":[134],"non-redundant":[136],"subset.":[137],"Experiments":[138],"show":[139],"that":[140],"achieves":[142,159],"stronger":[143],"overall":[144],"than":[146],"baselines":[147],"same":[150],"label":[151],"budgets":[152],"across":[153],"MeMOTR":[154,166],"SambaMOTR.":[156],"Notably,":[157],"comparable":[161],"full":[163],"supervision":[164],"both":[168],"datasets":[169],"only":[171],"50%":[172],"of":[173],"labeled":[175],"data.":[177]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-13T00:00:00"}
