{"id":"https://openalex.org/W4417339939","doi":"https://doi.org/10.1145/3785009","title":"CaneSpeaker: An LLM-Assisted Speaker for Generating Human-Like Navigation Instructions","display_name":"CaneSpeaker: An LLM-Assisted Speaker for Generating Human-Like Navigation Instructions","publication_year":2025,"publication_date":"2025-12-15","ids":{"openalex":"https://openalex.org/W4417339939","doi":"https://doi.org/10.1145/3785009"},"language":"en","primary_location":{"id":"doi:10.1145/3785009","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3785009","pdf_url":null,"source":{"id":"https://openalex.org/S19610489","display_name":"ACM Transactions on Multimedia Computing Communications and Applications","issn_l":"1551-6857","issn":["1551-6857","1551-6865"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Multimedia Computing, Communications, and Applications","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5061738429","display_name":"Yuexiao Zheng","orcid":"https://orcid.org/0009-0007-1208-4220"},"institutions":[{"id":"https://openalex.org/I116953780","display_name":"Tongji University","ror":"https://ror.org/03rc6as71","country_code":"CN","type":"education","lineage":["https://openalex.org/I116953780"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuanyu Zheng","raw_affiliation_strings":["School of Computer Science and Technology, Tongji University, Shanghai, China","School of Computer Science and Technology, Tongji University, China"],"raw_orcid":"https://orcid.org/0009-0007-1208-4220","affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, Tongji University, Shanghai, China","institution_ids":["https://openalex.org/I116953780"]},{"raw_affiliation_string":"School of Computer Science and Technology, Tongji University, China","institution_ids":["https://openalex.org/I116953780"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100351849","display_name":"Lin Zhang","orcid":"https://orcid.org/0000-0002-4360-5523"},"institutions":[{"id":"https://openalex.org/I116953780","display_name":"Tongji University","ror":"https://ror.org/03rc6as71","country_code":"CN","type":"education","lineage":["https://openalex.org/I116953780"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lin Zhang","raw_affiliation_strings":["School of Computer Science and Technology, Tongji University, Shanghai, China","School of Computer Science and Technology, Tongji University, China"],"raw_orcid":"https://orcid.org/0000-0002-4360-5523","affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, Tongji University, Shanghai, China","institution_ids":["https://openalex.org/I116953780"]},{"raw_affiliation_string":"School of Computer Science and Technology, Tongji University, China","institution_ids":["https://openalex.org/I116953780"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100526684","display_name":"Yunda Sun","orcid":"https://orcid.org/0009-0000-3926-540X"},"institutions":[{"id":"https://openalex.org/I116953780","display_name":"Tongji University","ror":"https://ror.org/03rc6as71","country_code":"CN","type":"education","lineage":["https://openalex.org/I116953780"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yunda Sun","raw_affiliation_strings":["School of Computer Science and Technology, Tongji University, Shanghai, China","School of Computer Science and Technology, Tongji University, China"],"raw_orcid":"https://orcid.org/0009-0000-3926-540X","affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, Tongji University, Shanghai, China","institution_ids":["https://openalex.org/I116953780"]},{"raw_affiliation_string":"School of Computer Science and Technology, Tongji University, China","institution_ids":["https://openalex.org/I116953780"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021739501","display_name":"Ying Shen","orcid":"https://orcid.org/0000-0002-2966-7955"},"institutions":[{"id":"https://openalex.org/I116953780","display_name":"Tongji University","ror":"https://ror.org/03rc6as71","country_code":"CN","type":"education","lineage":["https://openalex.org/I116953780"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ying Shen","raw_affiliation_strings":["School of Computer Science and Technology, Tongji University, Shanghai, China","School of Computer Science and Technology, Tongji University, China"],"raw_orcid":"https://orcid.org/0000-0002-2966-7955","affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, Tongji University, Shanghai, China","institution_ids":["https://openalex.org/I116953780"]},{"raw_affiliation_string":"School of Computer Science and Technology, Tongji University, China","institution_ids":["https://openalex.org/I116953780"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5016465182","display_name":"Shengjie Zhao","orcid":"https://orcid.org/0000-0002-4301-394X"},"institutions":[{"id":"https://openalex.org/I116953780","display_name":"Tongji University","ror":"https://ror.org/03rc6as71","country_code":"CN","type":"education","lineage":["https://openalex.org/I116953780"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shengjie Zhao","raw_affiliation_strings":["School of Computer Science and Technology, Tongji University, Shanghai, China","School of Computer Science and Technology, Tongji University, China"],"raw_orcid":"https://orcid.org/0000-0002-4301-394X","affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, Tongji University, Shanghai, China","institution_ids":["https://openalex.org/I116953780"]},{"raw_affiliation_string":"School of Computer Science and Technology, Tongji University, China","institution_ids":["https://openalex.org/I116953780"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.36100662,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"22","issue":"1","first_page":"1","last_page":"26"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9697999954223633,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9697999954223633,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.0044999998062849045,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.003599999938160181,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.6620000004768372},{"id":"https://openalex.org/keywords/variety","display_name":"Variety (cybernetics)","score":0.5737000107765198},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5331000089645386},{"id":"https://openalex.org/keywords/dependency","display_name":"Dependency (UML)","score":0.5002999901771545},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.33719998598098755},{"id":"https://openalex.org/keywords/scarcity","display_name":"Scarcity","score":0.31360000371932983}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.9243000149726868},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.6620000004768372},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.5737000107765198},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5331000089645386},{"id":"https://openalex.org/C19768560","wikidata":"https://www.wikidata.org/wiki/Q320727","display_name":"Dependency (UML)","level":2,"score":0.5002999901771545},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.45680001378059387},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.34549999237060547},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.33719998598098755},{"id":"https://openalex.org/C109747225","wikidata":"https://www.wikidata.org/wiki/Q815758","display_name":"Scarcity","level":2,"score":0.31360000371932983},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.3098999857902527},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.27790001034736633},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.2754000127315521},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.27160000801086426},{"id":"https://openalex.org/C2983448237","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Language understanding","level":2,"score":0.26809999346733093},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2549999952316284}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3785009","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3785009","pdf_url":null,"source":{"id":"https://openalex.org/S19610489","display_name":"ACM Transactions on Multimedia Computing Communications and Applications","issn_l":"1551-6857","issn":["1551-6857","1551-6865"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Multimedia Computing, Communications, and Applications","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G583652794","display_name":null,"funder_award_id":"62272343","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335787","display_name":"Fundamental Research Funds for the Central Universities","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":7,"referenced_works":["https://openalex.org/W3034500398","https://openalex.org/W3034578524","https://openalex.org/W3192009892","https://openalex.org/W4226052928","https://openalex.org/W4312501707","https://openalex.org/W4390873167","https://openalex.org/W4402703032"],"related_works":[],"abstract_inverted_index":{"Navigation":[0,10],"instruction":[1,88],"generation":[2],"aims":[3],"to":[4,58,72,97,113,127,178,184,212],"address":[5,49],"data":[6,20,198],"scarcity":[7,199],"in":[8,222],"Vision-and-Language":[9],"(VLN)":[11],"by":[12,103,200],"generating":[13],"navigation":[14],"instructions":[15,61,100,159,233],"for":[16,65,101,121],"unannotated":[17,220],"routes":[18,221],"from":[19,32,62,161,204,219],"sources":[21],"like":[22],"simulators":[23],"or":[24],"online":[25],"data.":[26,47],"However,":[27],"existing":[28,107],"methods":[29],"usually":[30],"suffer":[31],"high":[33],"reliance":[34],"on":[35,131,166,246],"panoramic":[36,132,167],"views,":[37,133],"poor":[38],"cross-task":[39],"generalization":[40],"ability,":[41],"and":[42,105,241,256],"limited":[43,75],"availability":[44],"of":[45,68,77,150,173,188,197],"training":[46,79,102],"To":[48],"these":[50],"challenges,":[51],"we":[52,81,134,209],"propose":[53,82,135],"a":[54,66,136,151,180,214],"novel":[55,137],"speaker,":[56],"CaneSpeaker,":[57],"generate":[59,157],"human-like":[60],"front-facing":[63,162],"images":[64,163],"variety":[67],"VLN":[69,190,206,239,243],"tasks.":[70,207],"First,":[71],"mitigate":[73],"the":[74,129,146,170,174,186,195,223,242],"amount":[76],"speaker":[78,122,141],"data,":[80],"an":[83,94,115],"Large":[84],"Language":[85],"Model":[86,139],"(LLM)-based":[87],"augmentation":[89],"method,":[90],"LLM-IA,":[91],"that":[92,229],"utilizes":[93],"off-the-shelf":[95],"LLM":[96],"create":[98],"augmented":[99,216],"distilling":[104],"reformulating":[106],"instructions.":[108],"This":[109],"method":[110],"allows":[111,176],"us":[112,177],"collect":[114],"instruction-augmented":[116],"dataset":[117],"with":[118,234],"human-level":[119],"accuracy":[120],"training,":[123],"namely":[124],"Rx2R.":[125],"Second,":[126],"eliminate":[128],"dependency":[130],"Vision-Language":[138],"(VLM)-based":[140],"architecture,":[142],"VL-Sp.":[143],"By":[144],"leveraging":[145],"advanced":[147],"reasoning":[148],"capabilities":[149],"pre-trained":[152],"VLM,":[153],"CaneSpeaker":[154,211,230],"can":[155],"effectively":[156],"high-quality":[158],"directly":[160],"without":[164],"relying":[165],"views.":[168],"Also,":[169],"prompt-based":[171],"characteristic":[172],"VLM":[175],"devise":[179],"unified":[181],"input":[182],"representation":[183],"enable":[185],"processing":[187],"multiple":[189,202],"tasks,":[191,240],"thus":[192],"further":[193],"addressing":[194],"problem":[196],"combining":[201],"datasets":[203,248,257],"different":[205],"Finally,":[208],"utilize":[210],"synthesize":[213],"large-scale":[215],"dataset,":[217],"CANE,":[218],"Matterport3D":[224],"Simulator.":[225],"Comprehensive":[226],"experiments":[227],"demonstrate":[228],"generates":[231],"precise":[232],"diverse":[235],"expressions":[236],"across":[237],"various":[238],"agent":[244],"trained":[245],"our":[247],"obviously":[249],"outperforms":[250],"its":[251],"counterparts.":[252],"The":[253],"source":[254],"codes":[255],"are":[258],"available":[259],"at":[260],"https://github.com/zheng19845/CaneSpeaker":[261],".":[262]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-12-15T00:00:00"}
