{"id":"https://openalex.org/W4405785095","doi":"https://doi.org/10.1109/iros58592.2024.10802596","title":"Object Segmentation from Open-Vocabulary Manipulation Instructions Based on Optimal Transport Polygon Matching with Multimodal Foundation Models","display_name":"Object Segmentation from Open-Vocabulary Manipulation Instructions Based on Optimal Transport Polygon Matching with Multimodal Foundation Models","publication_year":2024,"publication_date":"2024-10-14","ids":{"openalex":"https://openalex.org/W4405785095","doi":"https://doi.org/10.1109/iros58592.2024.10802596"},"language":"en","primary_location":{"id":"doi:10.1109/iros58592.2024.10802596","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iros58592.2024.10802596","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5111249512","display_name":"Takayuki Nishimura","orcid":null},"institutions":[{"id":"https://openalex.org/I203951103","display_name":"Keio University","ror":"https://ror.org/02kn6nx58","country_code":"JP","type":"education","lineage":["https://openalex.org/I203951103"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Takayuki Nishimura","raw_affiliation_strings":["Keio University,Kanagawa,Japan,223-8522"],"affiliations":[{"raw_affiliation_string":"Keio University,Kanagawa,Japan,223-8522","institution_ids":["https://openalex.org/I203951103"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5094179876","display_name":"Katsuyuki Kuyo","orcid":"https://orcid.org/0009-0003-5746-9387"},"institutions":[{"id":"https://openalex.org/I203951103","display_name":"Keio University","ror":"https://ror.org/02kn6nx58","country_code":"JP","type":"education","lineage":["https://openalex.org/I203951103"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Katsuyuki Kuyo","raw_affiliation_strings":["Keio University,Kanagawa,Japan,223-8522"],"affiliations":[{"raw_affiliation_string":"Keio University,Kanagawa,Japan,223-8522","institution_ids":["https://openalex.org/I203951103"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030254193","display_name":"Motonari Kambara","orcid":"https://orcid.org/0000-0002-1991-9119"},"institutions":[{"id":"https://openalex.org/I203951103","display_name":"Keio University","ror":"https://ror.org/02kn6nx58","country_code":"JP","type":"education","lineage":["https://openalex.org/I203951103"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Motonari Kambara","raw_affiliation_strings":["Keio University,Kanagawa,Japan,223-8522"],"affiliations":[{"raw_affiliation_string":"Keio University,Kanagawa,Japan,223-8522","institution_ids":["https://openalex.org/I203951103"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5033744547","display_name":"Komei Sugiura","orcid":"https://orcid.org/0000-0002-0261-0510"},"institutions":[{"id":"https://openalex.org/I203951103","display_name":"Keio University","ror":"https://ror.org/02kn6nx58","country_code":"JP","type":"education","lineage":["https://openalex.org/I203951103"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Komei Sugiura","raw_affiliation_strings":["Keio University,Kanagawa,Japan,223-8522"],"affiliations":[{"raw_affiliation_string":"Keio University,Kanagawa,Japan,223-8522","institution_ids":["https://openalex.org/I203951103"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5111249512"],"corresponding_institution_ids":["https://openalex.org/I203951103"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.24930569,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"9549","last_page":"9556"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8889999985694885,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8889999985694885,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10586","display_name":"Robotic Path Planning Algorithms","score":0.8385000228881836,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.8227999806404114,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7721874117851257},{"id":"https://openalex.org/keywords/foundation","display_name":"Foundation (evidence)","score":0.7218724489212036},{"id":"https://openalex.org/keywords/polygon","display_name":"Polygon (computer graphics)","score":0.7135862112045288},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.5855126976966858},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.4905490279197693},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.4844832122325897},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4815123975276947},{"id":"https://openalex.org/keywords/vocabulary","display_name":"Vocabulary","score":0.4273979067802429},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.41485464572906494},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.39094918966293335},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3247002959251404},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.0718056857585907},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.07078048586845398},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.0600227415561676}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7721874117851257},{"id":"https://openalex.org/C2780966255","wikidata":"https://www.wikidata.org/wiki/Q5474306","display_name":"Foundation (evidence)","level":2,"score":0.7218724489212036},{"id":"https://openalex.org/C190694206","wikidata":"https://www.wikidata.org/wiki/Q3276654","display_name":"Polygon (computer graphics)","level":3,"score":0.7135862112045288},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.5855126976966858},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.4905490279197693},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.4844832122325897},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4815123975276947},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.4273979067802429},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.41485464572906494},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.39094918966293335},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3247002959251404},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0718056857585907},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.07078048586845398},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0600227415561676},{"id":"https://openalex.org/C166957645","wikidata":"https://www.wikidata.org/wiki/Q23498","display_name":"Archaeology","level":1,"score":0.0},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.0},{"id":"https://openalex.org/C95457728","wikidata":"https://www.wikidata.org/wiki/Q309","display_name":"History","level":0,"score":0.0},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/iros58592.2024.10802596","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iros58592.2024.10802596","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":54,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W2251512949","https://openalex.org/W2302548814","https://openalex.org/W2489434015","https://openalex.org/W2962716343","https://openalex.org/W2963109634","https://openalex.org/W2964339842","https://openalex.org/W2964345792","https://openalex.org/W2980088508","https://openalex.org/W3034325957","https://openalex.org/W3034578524","https://openalex.org/W3034758614","https://openalex.org/W3035097537","https://openalex.org/W3035709993","https://openalex.org/W3090449556","https://openalex.org/W3094502228","https://openalex.org/W3120596229","https://openalex.org/W3159619744","https://openalex.org/W3180134609","https://openalex.org/W3193171560","https://openalex.org/W3216551675","https://openalex.org/W4200631575","https://openalex.org/W4214490042","https://openalex.org/W4221167437","https://openalex.org/W4224912544","https://openalex.org/W4225832925","https://openalex.org/W4309181071","https://openalex.org/W4383097638","https://openalex.org/W4385431288","https://openalex.org/W4386065742","https://openalex.org/W4386065815","https://openalex.org/W4386075493","https://openalex.org/W4386076142","https://openalex.org/W4389666568","https://openalex.org/W4389666751","https://openalex.org/W4390691239","https://openalex.org/W4390874575","https://openalex.org/W4399666346","https://openalex.org/W4403448419","https://openalex.org/W6682962330","https://openalex.org/W6687483927","https://openalex.org/W6779823529","https://openalex.org/W6791353385","https://openalex.org/W6811013733","https://openalex.org/W6846512445","https://openalex.org/W6849317621","https://openalex.org/W6850503672","https://openalex.org/W6851607685","https://openalex.org/W6853116092","https://openalex.org/W6853534422","https://openalex.org/W6854555012","https://openalex.org/W6856810218","https://openalex.org/W6857785731","https://openalex.org/W6858836066"],"related_works":["https://openalex.org/W2381393187","https://openalex.org/W2332779545","https://openalex.org/W2358060160","https://openalex.org/W2035483685","https://openalex.org/W1969764885","https://openalex.org/W596947562","https://openalex.org/W2793937822","https://openalex.org/W2790817834","https://openalex.org/W2777605427","https://openalex.org/W2501983714"],"abstract_inverted_index":{"We":[0,82],"consider":[1],"the":[2,9,40,49,57,96,104,118,127,130,148],"task":[3],"of":[4,43,51,98,129],"generating":[5],"segmentation":[6,30,76],"masks":[7,77],"for":[8,37],"target":[10],"object":[11,14],"from":[12,78],"an":[13],"manipulation":[15],"instruction,":[16],"which":[17,48,60],"allows":[18],"users":[19],"to":[20,25,35,62,91],"give":[21],"open":[22,79],"vocabulary":[23,80],"instructions":[24],"domestic":[26],"service":[27],"robots.":[28],"Conventional":[29],"generation":[31,137],"approaches":[32],"often":[33],"fail":[34],"account":[36],"objects":[38],"outside":[39],"camera\u2019s":[41],"field":[42],"view":[44],"and":[45,121],"cases":[46],"in":[47],"order":[50,97],"vertices":[52,99],"differs":[53,100],"but":[54,101],"still":[55,102],"represents":[56,103],"same":[58,105],"polygon,":[59],"leads":[61],"erroneous":[63],"mask":[64,136],"generation.":[65],"In":[66],"this":[67],"study,":[68],"we":[69,111],"propose":[70],"a":[71,84,113,144,152],"novel":[72,85],"method":[73,132],"that":[74],"generates":[75],"instructions.":[81],"implement":[83],"loss":[86,94],"function":[87],"using":[88],"optimal":[89],"transport":[90],"prevent":[92],"significant":[93],"where":[95],"polygon.":[106],"To":[107],"evaluate":[108],"our":[109,140],"approach,":[110],"constructed":[112],"new":[114],"dataset":[115,120,149],"based":[116],"on":[117,147],"REVERIE":[119],"Matterport3D":[122],"dataset.":[123],"The":[124],"results":[125],"demonstrated":[126],"effectiveness":[128],"proposed":[131],"compared":[133,150],"with":[134,151],"existing":[135],"methods.":[138],"Remarkably,":[139],"best":[141],"model":[142],"achieved":[143],"+16.32%":[145],"improvement":[146],"representative":[153],"polygon-based":[154],"method.":[155]},"counts_by_year":[],"updated_date":"2025-12-23T23:11:35.936235","created_date":"2025-10-10T00:00:00"}
