{"id":"https://openalex.org/W4389814304","doi":"https://doi.org/10.48550/arxiv.2312.08514","title":"TAM-VT: Transformation-Aware Multi-scale Video Transformer for Segmentation and Tracking","display_name":"TAM-VT: Transformation-Aware Multi-scale Video Transformer for Segmentation and Tracking","publication_year":2023,"publication_date":"2023-12-13","ids":{"openalex":"https://openalex.org/W4389814304","doi":"https://doi.org/10.48550/arxiv.2312.08514"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2312.08514","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2312.08514","pdf_url":"https://arxiv.org/pdf/2312.08514","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2312.08514","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5000686301","display_name":"Raghav Goyal","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Goyal, Raghav","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022911737","display_name":"Wan-Cyuan Fan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fan, Wan-Cyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022335103","display_name":"Mennatullah Siam","orcid":"https://orcid.org/0000-0003-1854-3698"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Siam, Mennatullah","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5053011888","display_name":"Leonid Sigal","orcid":"https://orcid.org/0000-0002-3942-2804"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sigal, Leonid","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5000686301"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10331","display_name":"Video Surveillance and Tracking Methods","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10331","display_name":"Video Surveillance and Tracking Methods","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8366470336914062},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.6524418592453003},{"id":"https://openalex.org/keywords/video-tracking","display_name":"Video tracking","score":0.6293284296989441},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6002993583679199},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.5175249576568604},{"id":"https://openalex.org/keywords/coding","display_name":"Coding (social sciences)","score":0.5095903873443604},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.49351418018341064},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.4687597453594208},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.4464928209781647},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.42770880460739136},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.39143070578575134}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8366470336914062},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.6524418592453003},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.6293284296989441},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6002993583679199},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.5175249576568604},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.5095903873443604},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.49351418018341064},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4687597453594208},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.4464928209781647},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.42770880460739136},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.39143070578575134},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2312.08514","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2312.08514","pdf_url":"https://arxiv.org/pdf/2312.08514","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2312.08514","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2312.08514","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2312.08514","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2312.08514","pdf_url":"https://arxiv.org/pdf/2312.08514","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/11","display_name":"Sustainable cities and communities","score":0.5099999904632568}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4389814304.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W3203142394","https://openalex.org/W4390516098","https://openalex.org/W2161474341","https://openalex.org/W4302615923","https://openalex.org/W2181948922","https://openalex.org/W1974101135","https://openalex.org/W2351061015","https://openalex.org/W2366163563","https://openalex.org/W2163885456","https://openalex.org/W4242191701"],"abstract_inverted_index":{"Video":[0],"Object":[1],"Segmentation":[2],"(VOS)":[3],"has":[4],"emerged":[5],"as":[6,245,247],"an":[7,98],"increasingly":[8],"important":[9,196],"problem":[10],"with":[11,26,165,192],"availability":[12],"of":[13,47,94,106,238,253],"larger":[14],"datasets":[15,219],"and":[16,19,39,78,147,152,157,178,189,222,256],"more":[17],"complex":[18,217],"realistic":[20],"settings,":[21],"which":[22,73,123],"involve":[23],"long":[24,128,155,166],"videos":[25,156,167],"global":[27],"motion":[28],"(e.g,":[29],"in":[30,134,168],"egocentric":[31,218],"settings),":[32],"depicting":[33],"small":[34,158],"objects":[35],"undergoing":[36],"both":[37],"rigid":[38],"non-rigid":[40],"(including":[41],"state)":[42],"deformations.":[43],"While":[44],"a":[45,67,85,104,114,169],"number":[46],"recent":[48],"approaches":[49],"have":[50],"been":[51],"explored":[52],"for":[53,141,154,199],"this":[54,63],"task,":[55],"these":[56,133,203],"data":[57],"characteristics":[58],"still":[59],"present":[60],"challenges.":[61,81],"In":[62],"work":[64],"we":[65,83,112,131],"propose":[66,84,113],"novel,":[68],"clip-based":[69],"DETR-style":[70],"encoder-decoder":[71],"architecture,":[72],"focuses":[74,90],"on":[75,92,215,230,259],"systematically":[76],"analyzing":[77],"addressing":[79],"aforementioned":[80],"Specifically,":[82],"novel":[86],"transformation-aware":[87],"loss":[88],"that":[89,185],"learning":[91],"portions":[93],"the":[95,174,231,251],"video":[96,139,175],"where":[97],"object":[99],"undergoes":[100],"significant":[101],"deformations":[102],"--":[103,220],"form":[105],"\"soft\"":[107],"hard":[108],"examples":[109],"mining.":[110],"Further,":[111],"multiplicative":[115],"time-coded":[116],"memory,":[117],"beyond":[118],"vanilla":[119],"additive":[120],"positional":[121],"encoding,":[122],"helps":[124],"propagate":[125],"context":[126,180],"across":[127],"videos.":[129],"Finally,":[130],"incorporate":[132],"our":[135,207,242],"proposed":[136],"holistic":[137],"multi-scale":[138,144],"transformer":[140],"tracking":[142],"via":[143],"memory":[145,191],"matching":[146],"decoding":[148],"to":[149,209,227],"ensure":[150],"sensitivity":[151],"accuracy":[153],"objects.":[159],"Our":[160],"model":[161,208],"enables":[162],"on-line":[163],"inference":[164],"windowed":[170],"fashion,":[171],"by":[172],"breaking":[173],"into":[176,250],"clips":[177],"propagating":[179],"among":[181],"them.":[182],"We":[183],"illustrate":[184],"short":[186],"clip":[187],"length":[188],"longer":[190],"learned":[193],"time-coding":[194],"are":[195],"design":[197,243],"choices":[198,244,255],"improved":[200],"performance.":[201,260],"Collectively,":[202],"technical":[204],"contributions":[205],"enable":[206],"achieve":[210],"new":[211],"state-of-the-art":[212],"(SoTA)":[213],"performance":[214],"two":[216],"VISOR":[221],"VOST,":[223],"while":[224],"achieving":[225],"comparable":[226],"SoTA":[228],"results":[229],"conventional":[232],"VOS":[233],"benchmark,":[234],"DAVIS'17.":[235],"A":[236],"series":[237],"detailed":[239],"ablations":[240],"validate":[241],"well":[246],"provide":[248],"insights":[249],"importance":[252],"parameter":[254],"their":[257],"impact":[258]},"counts_by_year":[],"updated_date":"2026-02-09T09:26:11.010843","created_date":"2023-12-16T00:00:00"}
