{"id":"https://openalex.org/W7134916114","doi":"https://doi.org/10.48550/arxiv.2603.09740","title":"Let's Reward Step-by-Step: Step-Aware Contrastive Alignment for Vision-Language Navigation in Continuous Environments","display_name":"Let's Reward Step-by-Step: Step-Aware Contrastive Alignment for Vision-Language Navigation in Continuous Environments","publication_year":2026,"publication_date":"2026-03-10","ids":{"openalex":"https://openalex.org/W7134916114","doi":"https://doi.org/10.48550/arxiv.2603.09740"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.09740","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.09740","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.09740","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128763697","display_name":"Haoyuan Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Li, Haoyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128747083","display_name":"Rui Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Rui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128690727","display_name":"Hehe Fan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fan, Hehe","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128736700","display_name":"Yi Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Yi","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5128763697"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9688000082969666,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9688000082969666,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.00570000009611249,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.0032999999821186066,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.5443999767303467},{"id":"https://openalex.org/keywords/imperfect","display_name":"Imperfect","score":0.4844000041484833},{"id":"https://openalex.org/keywords/resampling","display_name":"Resampling","score":0.4368000030517578},{"id":"https://openalex.org/keywords/divergence","display_name":"Divergence (linguistics)","score":0.4075999855995178},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.4034999907016754},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.3644999861717224},{"id":"https://openalex.org/keywords/bridge","display_name":"Bridge (graph theory)","score":0.34869998693466187},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.34869998693466187}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7685999870300293},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6090999841690063},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.5443999767303467},{"id":"https://openalex.org/C2780310539","wikidata":"https://www.wikidata.org/wiki/Q12547192","display_name":"Imperfect","level":2,"score":0.4844000041484833},{"id":"https://openalex.org/C150921843","wikidata":"https://www.wikidata.org/wiki/Q1170431","display_name":"Resampling","level":2,"score":0.4368000030517578},{"id":"https://openalex.org/C207390915","wikidata":"https://www.wikidata.org/wiki/Q1230525","display_name":"Divergence (linguistics)","level":2,"score":0.4075999855995178},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.4034999907016754},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.38429999351501465},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.3644999861717224},{"id":"https://openalex.org/C100776233","wikidata":"https://www.wikidata.org/wiki/Q2532492","display_name":"Bridge (graph theory)","level":2,"score":0.34869998693466187},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.34869998693466187},{"id":"https://openalex.org/C48372109","wikidata":"https://www.wikidata.org/wiki/Q3913","display_name":"Binary number","level":2,"score":0.3440999984741211},{"id":"https://openalex.org/C148220186","wikidata":"https://www.wikidata.org/wiki/Q7111912","display_name":"Outcome (game theory)","level":2,"score":0.32739999890327454},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3237000107765198},{"id":"https://openalex.org/C193319292","wikidata":"https://www.wikidata.org/wiki/Q272172","display_name":"Hamming distance","level":2,"score":0.32280001044273376},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.31769999861717224},{"id":"https://openalex.org/C103088060","wikidata":"https://www.wikidata.org/wiki/Q1062839","display_name":"Error detection and correction","level":2,"score":0.3116999864578247},{"id":"https://openalex.org/C2779843651","wikidata":"https://www.wikidata.org/wiki/Q7390335","display_name":"SIGNAL (programming language)","level":2,"score":0.3066999912261963},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.30230000615119934},{"id":"https://openalex.org/C147764199","wikidata":"https://www.wikidata.org/wiki/Q6865248","display_name":"Minification","level":2,"score":0.2773999869823456},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.2766000032424927},{"id":"https://openalex.org/C2780009758","wikidata":"https://www.wikidata.org/wiki/Q6804172","display_name":"Measure (data warehouse)","level":2,"score":0.2689000070095062},{"id":"https://openalex.org/C108650721","wikidata":"https://www.wikidata.org/wiki/Q1783253","display_name":"Counterfactual thinking","level":2,"score":0.25999999046325684}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.09740","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.09740","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.09740","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.09740","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision-Language":[0],"Navigation":[1],"in":[2,84],"Continuous":[3],"Environments":[4],"(VLN-CE)":[5],"requires":[6],"agents":[7],"to":[8,30,50,73,76,80,101,138],"learn":[9],"complex":[10],"reasoning":[11],"from":[12,43,46,52,105],"long-horizon":[13],"human":[14],"interactions.":[15],"While":[16],"Multi-modal":[17],"Large":[18],"Language":[19],"Models":[20],"(MLLMs)":[21],"have":[22],"driven":[23],"recent":[24],"progress,":[25],"current":[26],"training":[27,37],"paradigms":[28],"struggle":[29],"balance":[31],"generalization":[32],"capability,":[33],"error":[34],"recovery":[35],"and":[36,55,124,141],"stability.":[38],"Specifically,":[39],"(i)":[40],"policies":[41],"derived":[42],"SFT":[44],"suffer":[45],"compounding":[47],"errors,":[48],"struggling":[49],"recover":[51],"out-of-distribution":[53],"states,":[54],"(ii)":[56],"Reinforcement":[57],"Fine-Tuning":[58],"(RFT)":[59],"methods":[60],"e.g.":[61],"GRPO":[62],"are":[63],"bottlenecked":[64],"by":[65],"sparse":[66],"outcome":[67],"rewards.":[68],"Their":[69],"binary":[70],"feedback":[71],"fails":[72],"assign":[74],"credit":[75],"individual":[77],"steps,":[78],"leading":[79],"gradient":[81],"signal":[82],"collapse":[83],"failure":[85],"dominant":[86],"batches.":[87],"To":[88],"address":[89],"these":[90,129],"challenges,":[91],"we":[92],"introduce":[93],"Step-Aware":[94,113],"Contrastive":[95],"Alignment":[96],"(SACA),":[97],"a":[98],"framework":[99],"designed":[100],"extract":[102],"dense":[103],"supervision":[104],"imperfect":[106],"trajectories.":[107],"At":[108],"its":[109],"core,":[110],"the":[111],"Perception-Grounded":[112],"auditor":[114],"evaluates":[115],"progress":[116],"step-by-step,":[117],"disentangling":[118],"failed":[119],"trajectories":[120],"into":[121],"valid":[122],"prefixes":[123],"exact":[125],"divergence":[126],"points.":[127],"Leveraging":[128],"signals,":[130],"Scenario-Conditioned":[131],"Group":[132],"Construction":[133],"mechanism":[134],"dynamically":[135],"routes":[136],"batches":[137],"specialized":[139],"resampling":[140],"optimization":[142],"strategies.":[143],"Extensive":[144],"experiments":[145],"on":[146],"VLN-CE":[147],"benchmarks":[148],"demonstrate":[149],"that":[150],"SACA":[151],"achieves":[152],"state-of-the-art":[153],"performance.":[154]},"counts_by_year":[],"updated_date":"2026-03-12T06:18:43.230356","created_date":"2026-03-12T00:00:00"}
