{"id":"https://openalex.org/W4415539917","doi":"https://doi.org/10.1145/3746027.3755617","title":"End-to-End Multiple Object Tracking with Dynamic Scene Perception","display_name":"End-to-End Multiple Object Tracking with Dynamic Scene Perception","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415539917","doi":"https://doi.org/10.1145/3746027.3755617"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3755617","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755617","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5067428532","display_name":"Ruonan Wei","orcid":"https://orcid.org/0000-0002-2562-6021"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Ruonan Wei","raw_affiliation_strings":["School of Artificial Intelligence and Automation, Huazhong University of Science and Technology, Wuhan, China"],"raw_orcid":"https://orcid.org/0000-0002-2562-6021","affiliations":[{"raw_affiliation_string":"School of Artificial Intelligence and Automation, Huazhong University of Science and Technology, Wuhan, China","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015905456","display_name":"Yuntao Wang","orcid":"https://orcid.org/0000-0001-5232-008X"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuntao Wang","raw_affiliation_strings":["Artificial Intelligence and Automation, Huazhong University of Science and Technology, Wuhan, China","School of Artificial Intelligence and Automation, Huazhong University of Science and Technology, Wuhan, China"],"raw_orcid":"https://orcid.org/0000-0001-5232-008X","affiliations":[{"raw_affiliation_string":"Artificial Intelligence and Automation, Huazhong University of Science and Technology, Wuhan, China","institution_ids":["https://openalex.org/I47720641"]},{"raw_affiliation_string":"School of Artificial Intelligence and Automation, Huazhong University of Science and Technology, Wuhan, China","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035081565","display_name":"Siyan Fang","orcid":"https://orcid.org/0000-0002-3762-0479"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Siyan Fang","raw_affiliation_strings":["School of Artificial Intelligence and Automation, Huazhong University of Science and Technology, Wuhan City, China"],"raw_orcid":"https://orcid.org/0000-0002-3762-0479","affiliations":[{"raw_affiliation_string":"School of Artificial Intelligence and Automation, Huazhong University of Science and Technology, Wuhan City, China","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5030517812","display_name":"Yuehuan Wang","orcid":"https://orcid.org/0000-0001-7046-7587"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuehuan Wang","raw_affiliation_strings":["Artificial Intelligence and Automation, Huazhong University of Science and Technology, Wuhan, China","School of Artificial Intelligence and Automation, Huazhong University of Science and Technology, Wuhan, China"],"raw_orcid":"https://orcid.org/0000-0001-7046-7587","affiliations":[{"raw_affiliation_string":"Artificial Intelligence and Automation, Huazhong University of Science and Technology, Wuhan, China","institution_ids":["https://openalex.org/I47720641"]},{"raw_affiliation_string":"School of Artificial Intelligence and Automation, Huazhong University of Science and Technology, Wuhan, China","institution_ids":["https://openalex.org/I47720641"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5067428532"],"corresponding_institution_ids":["https://openalex.org/I47720641"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.28876865,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"8682","last_page":"8691"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10331","display_name":"Video Surveillance and Tracking Methods","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10331","display_name":"Video Surveillance and Tracking Methods","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.6585000157356262},{"id":"https://openalex.org/keywords/bittorrent-tracker","display_name":"BitTorrent tracker","score":0.6207000017166138},{"id":"https://openalex.org/keywords/video-tracking","display_name":"Video tracking","score":0.5909000039100647},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.5713000297546387},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5590999722480774},{"id":"https://openalex.org/keywords/tracking","display_name":"Tracking (education)","score":0.5306000113487244},{"id":"https://openalex.org/keywords/object-detection","display_name":"Object detection","score":0.4749999940395355},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.4334000051021576}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7670999765396118},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.723800003528595},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.6783000230789185},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.6585000157356262},{"id":"https://openalex.org/C57501372","wikidata":"https://www.wikidata.org/wiki/Q2021268","display_name":"BitTorrent tracker","level":3,"score":0.6207000017166138},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.5909000039100647},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.5713000297546387},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5590999722480774},{"id":"https://openalex.org/C2775936607","wikidata":"https://www.wikidata.org/wiki/Q466845","display_name":"Tracking (education)","level":2,"score":0.5306000113487244},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.4749999940395355},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.4334000051021576},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.4219000041484833},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.3910999894142151},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.3707999885082245},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.3319000005722046},{"id":"https://openalex.org/C83248878","wikidata":"https://www.wikidata.org/wiki/Q344000","display_name":"Active appearance model","level":3,"score":0.31790000200271606},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.29490000009536743},{"id":"https://openalex.org/C142853389","wikidata":"https://www.wikidata.org/wiki/Q744778","display_name":"Association (psychology)","level":2,"score":0.2809000015258789},{"id":"https://openalex.org/C2983325608","wikidata":"https://www.wikidata.org/wiki/Q17084606","display_name":"Data association","level":3,"score":0.26010000705718994}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3755617","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755617","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":21,"referenced_works":["https://openalex.org/W2124781496","https://openalex.org/W2194775991","https://openalex.org/W2252355370","https://openalex.org/W2603203130","https://openalex.org/W2922509574","https://openalex.org/W3035574168","https://openalex.org/W3095753995","https://openalex.org/W3106763294","https://openalex.org/W3119686997","https://openalex.org/W4286904999","https://openalex.org/W4312473433","https://openalex.org/W4313072323","https://openalex.org/W4385575022","https://openalex.org/W4386076204","https://openalex.org/W4386083103","https://openalex.org/W4390872615","https://openalex.org/W4391892422","https://openalex.org/W4394761924","https://openalex.org/W4400721625","https://openalex.org/W4402848704","https://openalex.org/W4402904054"],"related_works":[],"abstract_inverted_index":{"End-to-end":[0],"Multiple":[1],"Object":[2],"Tracking":[3],"(MOT)":[4],"frameworks":[5],"integrate":[6],"detection":[7],"and":[8,18,77,86,102,141,152],"tracking":[9,49],"into":[10,74,144],"a":[11,64,94,110],"unified":[12],"model,":[13],"avoiding":[14],"intermediate":[15],"information":[16,135,143],"loss":[17],"complicated":[19],"post-processing.":[20],"However,":[21],"existing":[22],"end-to-end":[23,164],"MOT":[24,66,157],"trackers":[25,165],"rely":[26],"on":[27,123,156],"track":[28],"queries":[29],"of":[30],"the":[31,148],"previous":[32],"frame":[33],"to":[34,44,104,114],"provide":[35],"prior":[36],"information.":[37],"Their":[38],"limited":[39],"short-term":[40,82,106,142],"temporal":[41],"modeling":[42],"struggle":[43],"cope":[45],"with":[46,81,173],"high":[47,174],"dynamic":[48,120],"scenarios,":[50],"where":[51],"inter-frame":[52],"target":[53],"variations":[54],"exhibit":[55],"significant":[56],"heterogeneity.":[57],"To":[58],"address":[59],"these":[60],"shortcomings,":[61],"we":[62],"propose":[63],"scene-perception":[65],"framework":[67],"(SP-MOT)":[68],"that":[69,98,132,160],"encodes":[70],"scene":[71,96,126,134],"context":[72,111],"understanding":[73,112],"long-term":[75,116,140],"embedding":[76],"adaptively":[78,138],"complements":[79],"it":[80],"cues,":[83],"enabling":[84],"discriminative":[85],"flexible":[87],"instance":[88],"representations.":[89],"Specifically,":[90],"SP-MOT":[91,161],"introduces:":[92],"(1)":[93],"learnable":[95],"query":[97],"globally":[99],"profiles":[100],"foreground":[101],"background":[103],"capture":[105],"scene-level":[107],"features;":[108,127],"(2)":[109],"module":[113],"uncover":[115],"stable":[117],"relationships":[118],"across":[119,166],"scenes":[121],"based":[122],"multiple":[124,167],"historical":[125],"(3)":[128],"scene-adaptive":[129],"augmented":[130],"decoding":[131],"leverages":[133],"as":[136],"guidance,":[137],"aggregating":[139],"object":[145],"embeddings,":[146],"improving":[147],"model's":[149],"association":[150],"ability":[151],"fault-tolerance.":[153],"Extensive":[154],"experiments":[155],"benchmarks":[158],"demonstrate":[159],"outperforms":[162],"state-of-the-art":[163],"metrics,":[168],"particularly":[169],"in":[170],"challenging":[171],"scenarios":[172],"dynamics.":[175]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-25T00:00:00"}
