{"id":"https://openalex.org/W7133337510","doi":"https://doi.org/10.48550/arxiv.2603.00461","title":"ReMoT: Reinforcement Learning with Motion Contrast Triplets","display_name":"ReMoT: Reinforcement Learning with Motion Contrast Triplets","publication_year":2026,"publication_date":"2026-02-28","ids":{"openalex":"https://openalex.org/W7133337510","doi":"https://doi.org/10.48550/arxiv.2603.00461"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.00461","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00461","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.00461","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5127935864","display_name":"Cong Wan","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wan, Cong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127916528","display_name":"Zeyu Guo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Zeyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128027125","display_name":"Jiangyang Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Jiangyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127982591","display_name":"SongLin Dong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dong, SongLin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127943556","display_name":"Yifan Bai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bai, Yifan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054572600","display_name":"Lin Peng","orcid":"https://orcid.org/0009-0008-1786-1946"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peng, Lin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128006293","display_name":"Zhiheng Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Zhiheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128002114","display_name":"Yihong Gong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gong, Yihong","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5127935864"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5573999881744385,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5573999881744385,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.11209999769926071,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.038600001484155655,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.771399974822998},{"id":"https://openalex.org/keywords/contrast","display_name":"Contrast (vision)","score":0.6717000007629395},{"id":"https://openalex.org/keywords/motion","display_name":"Motion (physics)","score":0.6363000273704529},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.6197999715805054},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.6093999743461609},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.5612000226974487},{"id":"https://openalex.org/keywords/measure","display_name":"Measure (data warehouse)","score":0.5494999885559082}],"concepts":[{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.771399974822998},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.714900016784668},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.6717000007629395},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6506999731063843},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.6363000273704529},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.6197999715805054},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.6093999743461609},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.5612000226974487},{"id":"https://openalex.org/C2780009758","wikidata":"https://www.wikidata.org/wiki/Q6804172","display_name":"Measure (data warehouse)","level":2,"score":0.5494999885559082},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.49959999322891235},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.45890000462532043},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.4163999855518341},{"id":"https://openalex.org/C165838908","wikidata":"https://www.wikidata.org/wiki/Q736777","display_name":"Calibration","level":2,"score":0.32010000944137573},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.31459999084472656},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2915000021457672},{"id":"https://openalex.org/C21080849","wikidata":"https://www.wikidata.org/wiki/Q13611879","display_name":"Data point","level":2,"score":0.2872999906539917},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.28110000491142273},{"id":"https://openalex.org/C2780624872","wikidata":"https://www.wikidata.org/wiki/Q852453","display_name":"Motion detection","level":3,"score":0.28110000491142273},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.257099986076355}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.00461","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00461","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.00461","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00461","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","score":0.5812684297561646,"display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0,83],"present":[1],"ReMoT,":[2],"a":[3,19,42,96,123],"unified":[4],"training":[5],"paradigm":[6],"to":[7,94],"systematically":[8],"address":[9],"the":[10,86],"fundamental":[11],"shortcomings":[12],"of":[13,99],"VLMs":[14],"in":[15,23,122],"spatio-temporal":[16,129],"consistency":[17],"--":[18],"critical":[20],"failure":[21],"point":[22],"navigation,":[24],"robotics,":[25],"and":[26,70,116],"autonomous":[27],"driving.":[28],"ReMoT":[29],"integrates":[30],"two":[31],"core":[32],"components:":[33],"(1)":[34],"A":[35],"rule-based":[36],"automatic":[37],"framework":[38],"that":[39],"generates":[40],"ReMoT-16K,":[41],"large-scale":[43],"(16.5K":[44],"triplets)":[45],"motion-contrast":[46],"dataset":[47],"derived":[48],"from":[49],"video":[50],"meta-annotations,":[51],"surpassing":[52],"costly":[53],"manual":[54],"or":[55],"model-based":[56],"generation.":[57],"(2)":[58],"Group":[59],"Relative":[60],"Policy":[61],"Optimization,":[62],"which":[63],"we":[64],"empirically":[65],"validate":[66],"yields":[67],"optimal":[68],"performance":[69,111,126],"data":[71],"efficiency":[72],"for":[73,89],"learning":[74],"this":[75],"contrastive":[76],"reasoning,":[77],"far":[78],"exceeding":[79],"standard":[80,118],"Supervised":[81],"Fine-Tuning.":[82],"also":[84],"construct":[85],"first":[87],"benchmark":[88,115],"fine-grained":[90],"motion":[91,101],"contrast":[92],"triplets":[93],"measure":[95],"VLM's":[97],"discrimination":[98],"subtle":[100],"attributes":[102],"(e.g.,":[103],"opposing":[104],"directions).":[105],"The":[106],"resulting":[107],"model":[108],"achieves":[109],"state-of-the-art":[110],"on":[112,128],"our":[113],"new":[114],"multiple":[117],"VLM":[119],"benchmarks,":[120],"culminating":[121],"remarkable":[124],"25.1%":[125],"leap":[127],"reasoning":[130],"tasks.":[131]},"counts_by_year":[],"updated_date":"2026-03-04T07:09:34.246503","created_date":"2026-03-04T00:00:00"}
