{"id":"https://openalex.org/W4416751058","doi":"https://doi.org/10.1109/iros60139.2025.11246718","title":"BookBot: A Robotic Manipulation Benchmark for Voice-Driven Book Recognition and Grasping in Cluttered Environments","display_name":"BookBot: A Robotic Manipulation Benchmark for Voice-Driven Book Recognition and Grasping in Cluttered Environments","publication_year":2025,"publication_date":"2025-10-19","ids":{"openalex":"https://openalex.org/W4416751058","doi":"https://doi.org/10.1109/iros60139.2025.11246718"},"language":null,"primary_location":{"id":"doi:10.1109/iros60139.2025.11246718","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iros60139.2025.11246718","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5103036207","display_name":"Huaqiang Wang","orcid":"https://orcid.org/0000-0003-1527-1439"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Huaqiang Wang","raw_affiliation_strings":["Tsinghua University,Department of Electronic Engineering"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University,Department of Electronic Engineering","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101091323","display_name":"Yuan Wang","orcid":"https://orcid.org/0009-0007-8604-6243"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuan Wang","raw_affiliation_strings":["Tsinghua University,Department of Electronic Engineering"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University,Department of Electronic Engineering","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026630522","display_name":"Xiang Li","orcid":"https://orcid.org/0000-0003-0945-145X"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiang Li","raw_affiliation_strings":["Tsinghua University,Department of Electronic Engineering"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University,Department of Electronic Engineering","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100346266","display_name":"Yali Li","orcid":"https://orcid.org/0000-0002-6629-7228"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yali Li","raw_affiliation_strings":["Tsinghua University,Department of Electronic Engineering"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University,Department of Electronic Engineering","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5113052507","display_name":"Shengjin Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shengjin Wang","raw_affiliation_strings":["Tsinghua University,Department of Electronic Engineering"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University,Department of Electronic Engineering","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I99065089"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.3943138,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"20960","last_page":"20967"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.4503999948501587,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.4503999948501587,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10709","display_name":"Social Robot Interaction and HRI","score":0.11829999834299088,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.0706000030040741,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6940000057220459},{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.670199990272522},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.6474999785423279},{"id":"https://openalex.org/keywords/ambiguity","display_name":"Ambiguity","score":0.6240000128746033},{"id":"https://openalex.org/keywords/robotics","display_name":"Robotics","score":0.6087999939918518},{"id":"https://openalex.org/keywords/grasp","display_name":"GRASP","score":0.5889999866485596},{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.44369998574256897},{"id":"https://openalex.org/keywords/automation","display_name":"Automation","score":0.44279998540878296},{"id":"https://openalex.org/keywords/parsing","display_name":"Parsing","score":0.4408999979496002}],"concepts":[{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.7006000280380249},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6940000057220459},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6736999750137329},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.670199990272522},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.6474999785423279},{"id":"https://openalex.org/C2780522230","wikidata":"https://www.wikidata.org/wiki/Q1140419","display_name":"Ambiguity","level":2,"score":0.6240000128746033},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.6087999939918518},{"id":"https://openalex.org/C171268870","wikidata":"https://www.wikidata.org/wiki/Q1486676","display_name":"GRASP","level":2,"score":0.5889999866485596},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.44369998574256897},{"id":"https://openalex.org/C115901376","wikidata":"https://www.wikidata.org/wiki/Q184199","display_name":"Automation","level":2,"score":0.44279998540878296},{"id":"https://openalex.org/C186644900","wikidata":"https://www.wikidata.org/wiki/Q194152","display_name":"Parsing","level":2,"score":0.4408999979496002},{"id":"https://openalex.org/C144559511","wikidata":"https://www.wikidata.org/wiki/Q2986279","display_name":"Principal (computer security)","level":2,"score":0.4318000078201294},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.42739999294281006},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.4072999954223633},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.38850000500679016},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.367900013923645},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.33480000495910645},{"id":"https://openalex.org/C2983448237","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Language understanding","level":2,"score":0.32339999079704285},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.32100000977516174},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.28630000352859497},{"id":"https://openalex.org/C2780626000","wikidata":"https://www.wikidata.org/wiki/Q5936775","display_name":"Human-in-the-loop","level":2,"score":0.28119999170303345},{"id":"https://openalex.org/C170858558","wikidata":"https://www.wikidata.org/wiki/Q1394144","display_name":"Automatic summarization","level":2,"score":0.2800000011920929},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.26579999923706055},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.2646999955177307},{"id":"https://openalex.org/C112313634","wikidata":"https://www.wikidata.org/wiki/Q7886648","display_name":"Complement (music)","level":5,"score":0.2590999901294708},{"id":"https://openalex.org/C2776650193","wikidata":"https://www.wikidata.org/wiki/Q264661","display_name":"Obstacle","level":2,"score":0.25600001215934753},{"id":"https://openalex.org/C100609095","wikidata":"https://www.wikidata.org/wiki/Q1335050","display_name":"Embodied cognition","level":2,"score":0.25130000710487366}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/iros60139.2025.11246718","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iros60139.2025.11246718","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320322392","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":36,"referenced_works":["https://openalex.org/W2110764733","https://openalex.org/W2128728535","https://openalex.org/W2135249503","https://openalex.org/W2194187530","https://openalex.org/W2194775991","https://openalex.org/W2565639579","https://openalex.org/W2740924709","https://openalex.org/W2952122856","https://openalex.org/W2963150697","https://openalex.org/W3034792612","https://openalex.org/W3035198432","https://openalex.org/W3039298230","https://openalex.org/W3090814639","https://openalex.org/W3092588809","https://openalex.org/W3129478823","https://openalex.org/W3131716792","https://openalex.org/W3132607695","https://openalex.org/W3175618949","https://openalex.org/W3177008250","https://openalex.org/W3187686865","https://openalex.org/W3207187156","https://openalex.org/W4283359318","https://openalex.org/W4382366145","https://openalex.org/W4386083144","https://openalex.org/W4386113267","https://openalex.org/W4389666118","https://openalex.org/W4389667112","https://openalex.org/W4401413802","https://openalex.org/W4401415287","https://openalex.org/W4402354047","https://openalex.org/W4402716457","https://openalex.org/W4405785057","https://openalex.org/W4405785545","https://openalex.org/W4405786528","https://openalex.org/W4405786713","https://openalex.org/W4414050913"],"related_works":[],"abstract_inverted_index":{"Books,":[0],"as":[1,7,9],"enduring":[2],"repositories":[3],"of":[4,47,59,64,179,188,199],"cultural":[5],"heritage":[6],"well":[8],"knowledge,":[10],"play":[11],"a":[12,116,165],"fundamental":[13],"role":[14],"in":[15,20,27,71,142],"human":[16],"development.":[17],"Although":[18],"advances":[19],"embodied":[21],"AI":[22],"and":[23,31,56,69,103,107,126,139,186],"robotics":[24],"revolutionize":[25],"automation":[26],"domains,":[28],"e.g.,":[29],"manufacturing":[30],"logistics,":[32],"robotic":[33,53,172],"book":[34,54,92,101,119,128],"manipulation":[35,70,120,167],"remains":[36],"an":[37,148],"underexplored":[38],"frontier.":[39],"Two":[40],"primary":[41],"bottlenecks":[42],"impede":[43],"progress:":[44],"(1)":[45],"scarcity":[46],"fine-grained":[48],"annotated":[49],"datasets":[50],"for":[51],"benchmarking":[52],"manipulation,":[55],"(2)":[57],"lack":[58],"unified":[60],"perception-action":[61],"frameworks":[62],"capable":[63],"dynamically":[65],"coupling":[66],"multi-modal":[67],"sensing":[68],"real-world":[72],"scenarios.":[73],"To":[74],"these":[75],"issues,":[76],"we":[77,113,131,163],"present":[78],"THU-Book,":[79],"the":[80,171,176,180,184,193,197],"first":[81],"open-access":[82],"benchmark":[83,195],"featuring":[84],"643":[85],"3D":[86],"scene":[87],"captures,":[88],"encompassing":[89],"11,298":[90],"high-fidelity":[91],"instances":[93],"with":[94,153],"rich":[95],"annotations":[96],"to":[97,105,122,137,156,159,169],"support":[98,123],"tasks":[99],"from":[100],"recognition":[102],"localization":[104],"grasping":[106],"repositioning.":[108],"Building":[109],"upon":[110],"this":[111],"foundation,":[112],"develop":[114],"BookBot,":[115],"novel":[117],"voice-interactive":[118],"pipeline":[121],"cross-environmental,":[124],"multilingual,":[125],"multi-categorical":[127],"manipulation.":[129],"First,":[130],"utilize":[132],"Large":[133],"Language":[134],"Models":[135],"(LLMs)":[136],"parse":[138],"comprehend":[140],"ambiguity":[141],"user":[143],"instructions.":[144],"We":[145],"further":[146],"propose":[147],"instance":[149],"segmentation":[150],"module":[151],"combined":[152],"OCR":[154],"tool":[155],"link":[157],"language":[158],"visual":[160],"instances.":[161],"Finally,":[162],"introduce":[164],"PCA-based":[166],"policy":[168],"refine":[170],"grasp":[173],"pose,":[174],"utilizing":[175],"principal":[177],"components":[178],"books\u2019":[181],"geometry,":[182],"improving":[183],"precision":[185],"efficiency":[187],"grasping.":[189],"Experiments":[190],"conducted":[191],"on":[192],"THU-Book":[194],"validate":[196],"effectiveness":[198],"our":[200],"BookBot.":[201],"The":[202],"dataset":[203],"is":[204],"available":[205],"at":[206],"https://github.com/wanghq-public/BookBot.":[207]},"counts_by_year":[],"updated_date":"2026-06-26T08:34:08.712188","created_date":"2025-11-28T00:00:00"}
