{"id":"https://openalex.org/W4405179038","doi":"https://doi.org/10.1109/tase.2024.3510777","title":"Listen, Perceive, Grasp: CLIP-Driven Attribute-Aware Network for Language-Conditioned Visual Segmentation and Grasping","display_name":"Listen, Perceive, Grasp: CLIP-Driven Attribute-Aware Network for Language-Conditioned Visual Segmentation and Grasping","publication_year":2024,"publication_date":"2024-12-09","ids":{"openalex":"https://openalex.org/W4405179038","doi":"https://doi.org/10.1109/tase.2024.3510777"},"language":"en","primary_location":{"id":"doi:10.1109/tase.2024.3510777","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tase.2024.3510777","pdf_url":null,"source":{"id":"https://openalex.org/S34881539","display_name":"IEEE Transactions on Automation Science and Engineering","issn_l":"1545-5955","issn":["1545-5955","1558-3783"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Automation Science and Engineering","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5039324336","display_name":"Jialong Xie","orcid":"https://orcid.org/0009-0000-0232-0049"},"institutions":[{"id":"https://openalex.org/I154099455","display_name":"Shandong University","ror":"https://ror.org/0207yh398","country_code":"CN","type":"education","lineage":["https://openalex.org/I154099455"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jialong Xie","raw_affiliation_strings":["School of Control Science and Engineering, Shandong University, Jinan, China"],"affiliations":[{"raw_affiliation_string":"School of Control Science and Engineering, Shandong University, Jinan, China","institution_ids":["https://openalex.org/I154099455"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101597404","display_name":"Jin Liu","orcid":"https://orcid.org/0000-0002-5143-2263"},"institutions":[{"id":"https://openalex.org/I154099455","display_name":"Shandong University","ror":"https://ror.org/0207yh398","country_code":"CN","type":"education","lineage":["https://openalex.org/I154099455"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jin Liu","raw_affiliation_strings":["School of Control Science and Engineering, Shandong University, Jinan, China"],"affiliations":[{"raw_affiliation_string":"School of Control Science and Engineering, Shandong University, Jinan, China","institution_ids":["https://openalex.org/I154099455"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018771207","display_name":"Saike Huang","orcid":"https://orcid.org/0009-0005-4932-1192"},"institutions":[{"id":"https://openalex.org/I154099455","display_name":"Shandong University","ror":"https://ror.org/0207yh398","country_code":"CN","type":"education","lineage":["https://openalex.org/I154099455"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Saike Huang","raw_affiliation_strings":["School of Control Science and Engineering, Shandong University, Jinan, China"],"affiliations":[{"raw_affiliation_string":"School of Control Science and Engineering, Shandong University, Jinan, China","institution_ids":["https://openalex.org/I154099455"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100616949","display_name":"Chaoqun Wang","orcid":"https://orcid.org/0000-0001-5780-7284"},"institutions":[{"id":"https://openalex.org/I154099455","display_name":"Shandong University","ror":"https://ror.org/0207yh398","country_code":"CN","type":"education","lineage":["https://openalex.org/I154099455"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chaoqun Wang","raw_affiliation_strings":["School of Control Science and Engineering, Shandong University, Jinan, China"],"affiliations":[{"raw_affiliation_string":"School of Control Science and Engineering, Shandong University, Jinan, China","institution_ids":["https://openalex.org/I154099455"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5085317250","display_name":"Fengyu Zhou","orcid":"https://orcid.org/0000-0001-5140-7036"},"institutions":[{"id":"https://openalex.org/I154099455","display_name":"Shandong University","ror":"https://ror.org/0207yh398","country_code":"CN","type":"education","lineage":["https://openalex.org/I154099455"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Fengyu Zhou","raw_affiliation_strings":["School of Control Science and Engineering, Shandong University, Jinan, China"],"affiliations":[{"raw_affiliation_string":"School of Control Science and Engineering, Shandong University, Jinan, China","institution_ids":["https://openalex.org/I154099455"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5039324336"],"corresponding_institution_ids":["https://openalex.org/I154099455"],"apc_list":null,"apc_paid":null,"fwci":0.245,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.559351,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":"22","issue":null,"first_page":"9729","last_page":"9740"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9950000047683716,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9950000047683716,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.9244999885559082,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9074000120162964,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/grasp","display_name":"GRASP","score":0.8825212717056274},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6451449990272522},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.5910608768463135},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5899807810783386},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.5412956476211548},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.4840249717235565},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.3322460651397705},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.15076446533203125}],"concepts":[{"id":"https://openalex.org/C171268870","wikidata":"https://www.wikidata.org/wiki/Q1486676","display_name":"GRASP","level":2,"score":0.8825212717056274},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6451449990272522},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5910608768463135},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5899807810783386},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.5412956476211548},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.4840249717235565},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3322460651397705},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.15076446533203125}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tase.2024.3510777","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tase.2024.3510777","pdf_url":null,"source":{"id":"https://openalex.org/S34881539","display_name":"IEEE Transactions on Automation Science and Engineering","issn_l":"1545-5955","issn":["1545-5955","1558-3783"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Automation Science and Engineering","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1484719356","display_name":null,"funder_award_id":"62103237","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G431833132","display_name":null,"funder_award_id":"ZR2021QF122","funder_id":"https://openalex.org/F4320324174","funder_display_name":"Natural Science Foundation of Shandong Province"},{"id":"https://openalex.org/G805196913","display_name":null,"funder_award_id":"2023TZXD018","funder_id":"https://openalex.org/F4320327827","funder_display_name":"Key Research and Development Project of Hainan Province"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320324174","display_name":"Natural Science Foundation of Shandong Province","ror":null},{"id":"https://openalex.org/F4320327827","display_name":"Key Research and Development Project of Hainan Province","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":54,"referenced_works":["https://openalex.org/W1892339738","https://openalex.org/W2108598243","https://openalex.org/W2123435073","https://openalex.org/W2194775991","https://openalex.org/W2953669419","https://openalex.org/W2962716343","https://openalex.org/W2962821282","https://openalex.org/W2962875890","https://openalex.org/W2963033241","https://openalex.org/W2963326767","https://openalex.org/W2963446712","https://openalex.org/W2967153639","https://openalex.org/W2981807748","https://openalex.org/W2990026901","https://openalex.org/W2993182889","https://openalex.org/W2996325525","https://openalex.org/W2998012869","https://openalex.org/W3004284873","https://openalex.org/W3035198432","https://openalex.org/W3037369911","https://openalex.org/W3090627744","https://openalex.org/W3130885760","https://openalex.org/W3169622581","https://openalex.org/W3170679572","https://openalex.org/W3186077919","https://openalex.org/W3205420310","https://openalex.org/W3216551675","https://openalex.org/W4207061897","https://openalex.org/W4210316419","https://openalex.org/W4221167790","https://openalex.org/W4285102324","https://openalex.org/W4312324773","https://openalex.org/W4312695529","https://openalex.org/W4317242584","https://openalex.org/W4366493121","https://openalex.org/W4377716582","https://openalex.org/W4381729886","https://openalex.org/W4383108836","https://openalex.org/W4385245566","https://openalex.org/W4387092545","https://openalex.org/W4389666604","https://openalex.org/W4391128534","https://openalex.org/W4401417251","https://openalex.org/W4403277141","https://openalex.org/W6723035356","https://openalex.org/W6724804524","https://openalex.org/W6771532053","https://openalex.org/W6791353385","https://openalex.org/W6791892719","https://openalex.org/W6800227314","https://openalex.org/W6801810553","https://openalex.org/W6845641797","https://openalex.org/W6851513886","https://openalex.org/W6858561505"],"related_works":["https://openalex.org/W2163296013","https://openalex.org/W2743859443","https://openalex.org/W2326995835","https://openalex.org/W165915117","https://openalex.org/W2059402478","https://openalex.org/W2123347777","https://openalex.org/W4387804363","https://openalex.org/W2019547100","https://openalex.org/W2477150073","https://openalex.org/W2515493494"],"abstract_inverted_index":{"Endowing":[0],"robots":[1,80,219,224],"with":[2],"the":[3,48,79,86,136,142,183,205,215,229,251,255,261,265,268,276,282,287,293,297,306,310,314,318,347],"ability":[4],"to":[5,58,81,98,110,134,161,202,227],"understand":[6,99],"natural":[7],"language":[8,273,300],"and":[9,55,76,84,102,114,120,140,145,163,173,192,272,302,305,312,325,341,359],"execute":[10],"grasping":[11,24,27,146,169,207,238,344,358],"is":[12,239,290],"a":[13,17,67,125,149,168,240,334],"challenging":[14],"task":[15],"in":[16,40,89,214,285,299,338,346],"human-centric":[18],"environment.":[19],"Existing":[20],"works":[21,34,248],"on":[22,30,210,263,356,363],"language-conditioned":[23,73,126,237,343],"achieve":[25],"end-to-end":[26],"detection":[28,253,316],"based":[29],"language.":[31,235,364],"However,":[32,217],"these":[33],"lack":[35],"fine-grained":[36],"visual":[37,51,74,103,115,303,320],"grounding,":[38],"resulting":[39],"cognitive":[41],"deficits":[42],"for":[43,72,243,292],"robots.":[44],"Moreover,":[45],"they":[46,259],"ignore":[47,260],"correlation":[49],"between":[50],"attributes":[52,321],"of":[53,171,176,185,204,270,275,296],"objects":[54,213],"grasping,":[56,77],"leading":[57],"coarse":[59],"grasp":[60,85,151,228,252,315,336],"poses.":[61],"To":[62],"this":[63,279,330],"end,":[64],"we":[65,93,106,130,154,353],"propose":[66],"CLIP-driven":[68],"aTtribute-aware":[69],"network":[70],"(CTNet)":[71],"segmentation":[75,127,174],"enabling":[78],"listen,":[82],"perceive,":[83],"referred":[87],"object":[88,231,298],"real-world":[90,179],"applications.":[91],"Specifically,":[92],"first":[94],"employ":[95],"Listen":[96],"stage":[97,109,133,289,308],"basic":[100],"linguistic":[101],"concepts.":[104],"Subsequently,":[105],"introduce":[107],"Perceive":[108],"mine":[111],"multi-modal":[112],"features":[113],"attribute":[116,138],"cues":[117],"(e.g.,":[118],"boundary":[119,324],"spatial":[121,143],"location),":[122],"then":[123],"yield":[124],"mask.":[128],"Further,":[129],"design":[131],"Grasp":[132],"aggregate":[135],"perceived":[137,319],"information":[139],"refine":[141],"location":[144],"rectangle,":[147],"generating":[148],"high-quality":[150],"pose.":[152],"Lastly,":[153],"provide":[155],"an":[156],"extended":[157],"large":[158],"dataset":[159,193],"Ref-OCID-Grasp":[160],"train":[162],"test":[164],"our":[165,186],"method,":[166],"achieving":[167],"accuracy":[170],"97.76%":[172],"OIoU":[175],"91.82%.":[177],"The":[178,189,246],"robotic":[180],"applications":[181],"demonstrate":[182],"effectiveness":[184],"proposed":[187,281],"approach.":[188],"project,":[190],"video,":[191],"can":[194,332],"be":[195],"found":[196],"at":[197],"<uri":[198],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[199],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">https://ctnetgrasp.github.io</uri>.":[200],"Note":[201],"Practitioners\u2014Most":[203],"existing":[206],"methods":[208],"focus":[209],"clearing":[211],"all":[212],"workspace.":[216],"as":[218,323],"integrate":[220],"into":[221],"human":[222,234],"society,":[223],"should":[225],"learn":[226],"desired":[230],"by":[232,317],"understanding":[233],"Therefore,":[236,278],"significant":[241],"skill":[242],"human-robot":[244],"collaboration.":[245],"prior":[247],"directly":[249],"complete":[250],"through":[254],"language-grasp":[256],"paradigm,":[257,284],"but":[258],"discussion":[262],"whether":[264],"robot":[266],"understands":[267],"concept":[269],"vision":[271],"expression":[274,301],"object.":[277],"paper":[280],"Listen-Perceive-Grasp":[283],"which":[286],"Listen-Perceive":[288],"responsible":[291],"conception":[294],"alignment":[295],"pixels,":[304],"Perceive-Grasp":[307],"achieves":[309],"constraining":[311],"refining":[313],"such":[322],"shape.":[326],"Experiments":[327],"show":[328],"that":[329],"method":[331],"obtain":[333],"refiner":[335],"pose":[337],"cluttered":[339],"environments":[340],"perform":[342],"well":[345],"real":[348],"world.":[349],"In":[350],"future":[351],"research,":[352],"will":[354],"work":[355],"6-DoF":[357],"multi-object":[360],"disambiguation":[361],"conditioned":[362]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-10T00:00:00"}
