{"id":"https://openalex.org/W7152998594","doi":"https://doi.org/10.48550/arxiv.2604.07765","title":"RemoteAgent: Bridging Vague Human Intents and Earth Observation with RL-based Agentic MLLMs","display_name":"RemoteAgent: Bridging Vague Human Intents and Earth Observation with RL-based Agentic MLLMs","publication_year":2026,"publication_date":"2026-04-09","ids":{"openalex":"https://openalex.org/W7152998594","doi":"https://doi.org/10.48550/arxiv.2604.07765"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.07765","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.07765","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.07765","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133352046","display_name":"Liang Yao","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yao, Liang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133392458","display_name":"Shengxiang Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Shengxiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133350448","display_name":"Fan Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Fan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133317425","display_name":"Chuanyi Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Chuanyi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133381056","display_name":"Bishun Yao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yao, Bishun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133325138","display_name":"Rui Min","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Min, Rui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133393054","display_name":"Yongjun Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yongjun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133247860","display_name":"Chaoqian Ouyang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ouyang, Chaoqian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133391298","display_name":"Shimin Di","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Di, Shimin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133372611","display_name":"Min-Ling Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Min-Ling","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5133352046"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.3790000081062317,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.3790000081062317,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10689","display_name":"Remote-Sensing Image Classification","score":0.2651999890804291,"subfield":{"id":"https://openalex.org/subfields/2214","display_name":"Media Technology"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10757","display_name":"Geographic Information Systems Studies","score":0.05469999834895134,"subfield":{"id":"https://openalex.org/subfields/3305","display_name":"Geography, Planning and Development"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.7348999977111816},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5067999958992004},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.44670000672340393},{"id":"https://openalex.org/keywords/bridge","display_name":"Bridge (graph theory)","score":0.4041000008583069},{"id":"https://openalex.org/keywords/natural-language-understanding","display_name":"Natural language understanding","score":0.39890000224113464},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.38179999589920044},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.3709000051021576},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.3634999990463257}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.781000018119812},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.7348999977111816},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5067999958992004},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.44760000705718994},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.44670000672340393},{"id":"https://openalex.org/C100776233","wikidata":"https://www.wikidata.org/wiki/Q2532492","display_name":"Bridge (graph theory)","level":2,"score":0.4041000008583069},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.39890000224113464},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.38179999589920044},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3709000051021576},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.3634999990463257},{"id":"https://openalex.org/C527412718","wikidata":"https://www.wikidata.org/wiki/Q855395","display_name":"Interpretation (philosophy)","level":2,"score":0.33059999346733093},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3278000056743622},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3257000148296356},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.3224000036716461},{"id":"https://openalex.org/C2780385302","wikidata":"https://www.wikidata.org/wiki/Q367158","display_name":"Protocol (science)","level":3,"score":0.3215999901294708},{"id":"https://openalex.org/C189950617","wikidata":"https://www.wikidata.org/wiki/Q937228","display_name":"Property (philosophy)","level":2,"score":0.2976999878883362},{"id":"https://openalex.org/C59732488","wikidata":"https://www.wikidata.org/wiki/Q2528440","display_name":"Visual analytics","level":3,"score":0.29490000009536743},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.2815000116825104},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.28130000829696655},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.2687000036239624},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.267300009727478},{"id":"https://openalex.org/C2779808786","wikidata":"https://www.wikidata.org/wiki/Q6664603","display_name":"Locality","level":2,"score":0.26660001277923584},{"id":"https://openalex.org/C56739046","wikidata":"https://www.wikidata.org/wiki/Q192060","display_name":"Knowledge management","level":1,"score":0.266400009393692},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2653000056743622},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.2639000117778778}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.07765","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.07765","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.07765","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.07765","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","score":0.4958731532096863,"display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Earth":[0],"Observation":[1],"(EO)":[2],"systems":[3],"are":[4],"essentially":[5],"designed":[6],"to":[7,68,103,141],"support":[8],"domain":[9],"experts":[10],"who":[11],"often":[12],"express":[13],"their":[14,82],"requirements":[15],"through":[16],"vague":[17,32,158],"natural":[18],"language":[19],"rather":[20],"than":[21],"precise,":[22],"machine-friendly":[23],"instructions.":[24],"Depending":[25],"on":[26],"the":[27,50,57,115,131,196],"specific":[28],"application":[29],"scenario,":[30],"these":[31],"queries":[33,55],"can":[34],"demand":[35],"vastly":[36],"different":[37],"levels":[38],"of":[39,135],"visual":[40,60],"precision.":[41],"Consequently,":[42,184],"a":[43,149,172],"practical":[44],"EO":[45,154,221],"AI":[46],"system":[47],"must":[48],"bridge":[49],"gap":[51],"between":[52],"ambiguous":[53],"human":[54],"and":[56,113,180],"appropriate":[58],"multi-granularity":[59],"analysis":[61],"tasks,":[62],"ranging":[63],"from":[64],"holistic":[65],"image":[66],"interpretation":[67],"fine-grained":[69],"pixel-wise":[70],"predictions.":[71,93,203],"While":[72],"Multi-modal":[73],"Large":[74],"Language":[75],"Models":[76],"(MLLMs)":[77],"demonstrate":[78,206],"strong":[79],"semantic":[80],"understanding,":[81],"text-based":[83],"output":[84],"format":[85],"is":[86,110],"inherently":[87],"ill-suited":[88],"for":[89,164,201],"dense,":[90],"precision-critical":[91],"spatial":[92],"Existing":[94],"agentic":[95,126],"frameworks":[96],"address":[97],"this":[98,120,139],"limitation":[99],"by":[100],"delegating":[101],"tasks":[102,155,188],"external":[104],"tools,":[105],"but":[106],"indiscriminate":[107],"tool":[108],"invocation":[109],"computationally":[111],"inefficient":[112],"underutilizes":[114],"MLLM's":[116],"native":[117],"capabilities.":[118],"To":[119,137],"end,":[121],"we":[122,146,167],"propose":[123],"RemoteAgent,":[124],"an":[125,169],"framework":[127,140],"that":[128,176,207],"strategically":[129],"respects":[130],"intrinsic":[132],"capability":[133],"boundaries":[134],"MLLMs.":[136],"empower":[138],"understand":[142],"real":[143],"user":[144],"intents,":[145],"construct":[147],"VagueEO,":[148],"human-centric":[150],"instruction":[151],"dataset":[152],"pairing":[153],"with":[156],"simulated":[157],"natural-language":[159],"queries.":[160],"By":[161],"leveraging":[162],"VagueEO":[163],"reinforcement":[165],"fine-tuning,":[166],"align":[168],"MLLM":[170],"into":[171],"robust":[173,210],"cognitive":[174],"core":[175],"directly":[177],"resolves":[178],"image-":[179],"sparse":[181],"region-level":[182],"tasks.":[183,222],"RemoteAgent":[185,208],"processes":[186],"suitable":[187],"internally":[189],"while":[190,214],"intelligently":[191],"orchestrating":[192],"specialized":[193],"tools":[194],"via":[195],"Model":[197],"Context":[198],"Protocol":[199],"exclusively":[200],"dense":[202],"Extensive":[204],"experiments":[205],"achieves":[209],"intent":[211],"recognition":[212],"capabilities":[213],"delivering":[215],"highly":[216],"competitive":[217],"performance":[218],"across":[219],"diverse":[220]},"counts_by_year":[],"updated_date":"2026-04-11T06:19:08.300824","created_date":"2026-04-11T00:00:00"}
