{"id":"https://openalex.org/W4416750582","doi":"https://doi.org/10.1109/iros60139.2025.11246064","title":"MovSAM: A Single-image Moving Object Segmentation Framework Based on Deep Thinking","display_name":"MovSAM: A Single-image Moving Object Segmentation Framework Based on Deep Thinking","publication_year":2025,"publication_date":"2025-10-19","ids":{"openalex":"https://openalex.org/W4416750582","doi":"https://doi.org/10.1109/iros60139.2025.11246064"},"language":null,"primary_location":{"id":"doi:10.1109/iros60139.2025.11246064","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iros60139.2025.11246064","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5080056008","display_name":"Chang Nie","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chang Nie","raw_affiliation_strings":["Shanghai Jiao Tong University,School of Automation and Intelligent Sensing,Shanghai,China,200240"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,School of Automation and Intelligent Sensing,Shanghai,China,200240","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100981480","display_name":"Yiqing Xu","orcid":"https://orcid.org/0009-0004-5783-4142"},"institutions":[{"id":"https://openalex.org/I25757504","display_name":"China University of Mining and Technology","ror":"https://ror.org/01xt2dr21","country_code":"CN","type":"education","lineage":["https://openalex.org/I25757504"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yiqing Xu","raw_affiliation_strings":["China University of Mining and Technology,The Advanced Robotics Research Center, Artificial Intelligence Research Institute and School of Information and Control Engineering,Xuzhou,China,221116"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"China University of Mining and Technology,The Advanced Robotics Research Center, Artificial Intelligence Research Institute and School of Information and Control Engineering,Xuzhou,China,221116","institution_ids":["https://openalex.org/I25757504"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100336726","display_name":"Guangming Wang","orcid":"https://orcid.org/0000-0001-7180-1559"},"institutions":[{"id":"https://openalex.org/I241749","display_name":"University of Cambridge","ror":"https://ror.org/013meh722","country_code":"GB","type":"education","lineage":["https://openalex.org/I241749"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Guangming Wang","raw_affiliation_strings":["University of Cambridge,Department of Engineering,Cambridge,U.K.,CB2 1PZ"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Cambridge,Department of Engineering,Cambridge,U.K.,CB2 1PZ","institution_ids":["https://openalex.org/I241749"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115595442","display_name":"Zhe Liu","orcid":"https://orcid.org/0009-0004-1429-9207"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhe Liu","raw_affiliation_strings":["Shanghai Jiao Tong University,School of Automation and Intelligent Sensing,Shanghai,China,200240"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,School of Automation and Intelligent Sensing,Shanghai,China,200240","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5021924831","display_name":"Yanzi Miao","orcid":"https://orcid.org/0000-0002-2688-7477"},"institutions":[{"id":"https://openalex.org/I25757504","display_name":"China University of Mining and Technology","ror":"https://ror.org/01xt2dr21","country_code":"CN","type":"education","lineage":["https://openalex.org/I25757504"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yanzi Miao","raw_affiliation_strings":["China University of Mining and Technology,The Advanced Robotics Research Center, Artificial Intelligence Research Institute and School of Information and Control Engineering,Xuzhou,China,221116"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"China University of Mining and Technology,The Advanced Robotics Research Center, Artificial Intelligence Research Institute and School of Information and Control Engineering,Xuzhou,China,221116","institution_ids":["https://openalex.org/I25757504"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.32746893,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"9647","last_page":"9653"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.27399998903274536,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.27399998903274536,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10331","display_name":"Video Surveillance and Tracking Methods","score":0.2628999948501587,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.11490000039339066,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.7249000072479248},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.6638000011444092},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5690000057220459},{"id":"https://openalex.org/keywords/image-segmentation","display_name":"Image segmentation","score":0.5216000080108643},{"id":"https://openalex.org/keywords/market-segmentation","display_name":"Market segmentation","score":0.5174999833106995},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.4447999894618988},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.4250999987125397},{"id":"https://openalex.org/keywords/object-detection","display_name":"Object detection","score":0.4025000035762787}],"concepts":[{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.8087000250816345},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7724000215530396},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.7318999767303467},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.7249000072479248},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.6638000011444092},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5690000057220459},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.5216000080108643},{"id":"https://openalex.org/C125308379","wikidata":"https://www.wikidata.org/wiki/Q363057","display_name":"Market segmentation","level":2,"score":0.5174999833106995},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.4447999894618988},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.4250999987125397},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.4025000035762787},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.3869999945163727},{"id":"https://openalex.org/C25694479","wikidata":"https://www.wikidata.org/wiki/Q7446278","display_name":"Segmentation-based object categorization","level":5,"score":0.38359999656677246},{"id":"https://openalex.org/C65885262","wikidata":"https://www.wikidata.org/wiki/Q7429708","display_name":"Scale-space segmentation","level":4,"score":0.3449000120162964},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.3377000093460083},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.30379998683929443},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.2903999984264374},{"id":"https://openalex.org/C3261483","wikidata":"https://www.wikidata.org/wiki/Q119565","display_name":"Frame rate","level":2,"score":0.2777999937534332},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2694000005722046},{"id":"https://openalex.org/C5339829","wikidata":"https://www.wikidata.org/wiki/Q1425977","display_name":"Machine vision","level":2,"score":0.26570001244544983},{"id":"https://openalex.org/C52102323","wikidata":"https://www.wikidata.org/wiki/Q1671968","display_name":"Pose","level":2,"score":0.26190000772476196},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.25200000405311584}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/iros60139.2025.11246064","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iros60139.2025.11246064","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":19,"referenced_works":["https://openalex.org/W2076756823","https://openalex.org/W2138682569","https://openalex.org/W2470139095","https://openalex.org/W2737008123","https://openalex.org/W2916797271","https://openalex.org/W3099392664","https://openalex.org/W3170511209","https://openalex.org/W3175373394","https://openalex.org/W3202858245","https://openalex.org/W3212881378","https://openalex.org/W4226071614","https://openalex.org/W4319300199","https://openalex.org/W4386071687","https://openalex.org/W4390874575","https://openalex.org/W4402728032","https://openalex.org/W4402754134","https://openalex.org/W4402915908","https://openalex.org/W4405197804","https://openalex.org/W4414281281"],"related_works":[],"abstract_inverted_index":{"Moving":[0],"object":[1,72,91,123],"segmentation":[2,126],"plays":[3],"a":[4,44,76,116,130],"vital":[5],"role":[6],"in":[7,161,171,199],"understanding":[8,141],"dynamic":[9],"visual":[10,107],"environments.":[11],"While":[12],"existing":[13,50],"methods":[14,51,189,198],"rely":[15],"on":[16,97,213],"multi-frame":[17,188,197],"image":[18,46],"sequences":[19],"to":[20,53,87,137,157,175],"identify":[21],"moving":[22,41,71,90,122,159],"objects,":[23],"single-image":[24,70],"MOS":[25,209],"is":[26],"critical":[27],"for":[28,49,69,100,182],"applications":[29],"like":[30],"motion":[31],"intention":[32],"prediction":[33],"and":[34,92,115,146,180],"handling":[35],"camera":[36],"frame":[37],"drops.":[38],"However,":[39],"segmenting":[40],"objects":[42,160],"from":[43,109],"single":[45,162],"remains":[47],"challenging":[48],"due":[52],"the":[54,66,89,110,143,172,187,193],"absence":[55],"of":[56,142,196],"temporal":[57,201],"cues.":[58],"To":[59],"address":[60],"this":[61],"gap,":[62],"we":[63],"propose":[64],"MovSAM,":[65],"first":[67],"framework":[68],"segmentation.":[73,101,124],"MovSAM":[74,136,156,170,203],"leverages":[75],"Multimodal":[77],"Large":[78],"Language":[79],"Model":[80,113,118],"(MLLM)":[81],"enhanced":[82],"with":[83,106,149],"Chain-of-Thought":[84],"(CoT)":[85],"prompting":[86],"search":[88],"generate":[93],"text":[94],"prompts":[95,103],"based":[96],"deep":[98,131],"thinking":[99,132],"These":[102],"are":[104],"cross-fused":[105],"features":[108],"Segment":[111],"Anything":[112],"(SAM)":[114],"Vision-Language":[117],"(VLM),":[119],"enabling":[120],"logic-driven":[121],"The":[125],"results":[127],"then":[128],"undergo":[129],"refinement":[133],"loop,":[134],"allowing":[135],"iteratively":[138],"improve":[139],"its":[140,177],"scene":[144,166],"context":[145],"inter-object":[147],"relationships":[148],"logical":[150],"reasoning.":[151],"This":[152],"innovative":[153],"approach":[154],"enables":[155],"segment":[158],"images":[163],"by":[164],"considering":[165],"understanding.":[167],"We":[168],"implement":[169],"real":[173],"world":[174],"validate":[176],"practical":[178],"application":[179],"effectiveness":[181],"autonomous":[183],"driving":[184],"scenarios":[185],"where":[186],"fail.":[190],"Furthermore,":[191],"despite":[192],"inherent":[194],"advantage":[195],"utilizing":[200],"information,":[202],"achieves":[204],"state-of-the-art":[205],"performance":[206],"across":[207],"public":[208],"benchmarks,":[210],"reaching":[211],"92.5%":[212],"J&F.":[214],"Our":[215],"implementation":[216],"will":[217],"be":[218],"available":[219],"at":[220],"https://github.com/IRMVLab/MovSAM.":[221]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-11-28T00:00:00"}
