{"id":"https://openalex.org/W7164520131","doi":"https://doi.org/10.1145/3802974.3809463","title":"When Drawing Is Not Enough: Exploring Spontaneous Speech with Sketch for Intent Alignment in Multimodal LLMs","display_name":"When Drawing Is Not Enough: Exploring Spontaneous Speech with Sketch for Intent Alignment in Multimodal LLMs","publication_year":2026,"publication_date":"2026-06-12","ids":{"openalex":"https://openalex.org/W7164520131","doi":"https://doi.org/10.1145/3802974.3809463"},"language":null,"primary_location":{"id":"doi:10.1145/3802974.3809463","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3802974.3809463","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion Publication of the 2026 ACM Designing Interactive Systems Conference","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3802974.3809463","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5037097290","display_name":"Weiyan Shi","orcid":"https://orcid.org/0009-0001-6035-9678"},"institutions":[{"id":"https://openalex.org/I152815399","display_name":"Singapore University of Technology and Design","ror":"https://ror.org/05j6fvn87","country_code":"SG","type":"education","lineage":["https://openalex.org/I152815399"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Weiyan Shi","raw_affiliation_strings":["Singapore University of Technology and Design, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0009-0001-6035-9678","affiliations":[{"raw_affiliation_string":"Singapore University of Technology and Design, Singapore, Singapore","institution_ids":["https://openalex.org/I152815399"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069548004","display_name":"Dorien Herremans","orcid":"https://orcid.org/0000-0001-8607-1640"},"institutions":[{"id":"https://openalex.org/I152815399","display_name":"Singapore University of Technology and Design","ror":"https://ror.org/05j6fvn87","country_code":"SG","type":"education","lineage":["https://openalex.org/I152815399"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Dorien Herremans","raw_affiliation_strings":["Information Systems, Technology, and Design, Singapore University of Technology and Design, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0000-0001-8607-1640","affiliations":[{"raw_affiliation_string":"Information Systems, Technology, and Design, Singapore University of Technology and Design, Singapore, Singapore","institution_ids":["https://openalex.org/I152815399"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5084357603","display_name":"Kenny Tsu Wei Choo","orcid":"https://orcid.org/0000-0003-3845-9143"},"institutions":[{"id":"https://openalex.org/I152815399","display_name":"Singapore University of Technology and Design","ror":"https://ror.org/05j6fvn87","country_code":"SG","type":"education","lineage":["https://openalex.org/I152815399"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Kenny Tsu Wei Choo","raw_affiliation_strings":["Singapore University of Technology and Design, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0000-0003-3845-9143","affiliations":[{"raw_affiliation_string":"Singapore University of Technology and Design, Singapore, Singapore","institution_ids":["https://openalex.org/I152815399"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.94914067,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"445","last_page":"449"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.6324999928474426,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.6324999928474426,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12090","display_name":"Language and cultural evolution","score":0.03840000182390213,"subfield":{"id":"https://openalex.org/subfields/3316","display_name":"Cultural Studies"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10789","display_name":"Interactive and Immersive Displays","score":0.03139999881386757,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/sketch","display_name":"Sketch","score":0.7754999995231628},{"id":"https://openalex.org/keywords/multimodality","display_name":"Multimodality","score":0.3206999897956848},{"id":"https://openalex.org/keywords/legibility","display_name":"Legibility","score":0.29989999532699585},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.2750999927520752},{"id":"https://openalex.org/keywords/perspective","display_name":"Perspective (graphical)","score":0.2741999924182892}],"concepts":[{"id":"https://openalex.org/C2779231336","wikidata":"https://www.wikidata.org/wiki/Q7534724","display_name":"Sketch","level":2,"score":0.7754999995231628},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.4553999900817871},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4546000063419342},{"id":"https://openalex.org/C46312422","wikidata":"https://www.wikidata.org/wiki/Q11024","display_name":"Communication","level":1,"score":0.349700003862381},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.3206999897956848},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.31769999861717224},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3095000088214874},{"id":"https://openalex.org/C2779332521","wikidata":"https://www.wikidata.org/wiki/Q1820694","display_name":"Legibility","level":2,"score":0.29989999532699585},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.29010000824928284},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.28929999470710754},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2750999927520752},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.2741999924182892},{"id":"https://openalex.org/C199033989","wikidata":"https://www.wikidata.org/wiki/Q1318295","display_name":"Narrative","level":2,"score":0.272599995136261},{"id":"https://openalex.org/C135641252","wikidata":"https://www.wikidata.org/wiki/Q738567","display_name":"Multimodal interaction","level":2,"score":0.2538999915122986}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3802974.3809463","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3802974.3809463","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion Publication of the 2026 ACM Designing Interactive Systems Conference","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3802974.3809463","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3802974.3809463","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion Publication of the 2026 ACM Designing Interactive Systems Conference","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.4813036024570465,"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":10,"referenced_works":["https://openalex.org/W107955398","https://openalex.org/W2105640325","https://openalex.org/W2948085278","https://openalex.org/W4206008800","https://openalex.org/W4231268356","https://openalex.org/W4400142550","https://openalex.org/W4404782878","https://openalex.org/W4407012612","https://openalex.org/W4409720501","https://openalex.org/W7131632443"],"related_works":[],"abstract_inverted_index":{"Early-stage":[0],"design":[1,113,140],"ideation":[2],"often":[3],"relies":[4],"on":[5],"rough":[6],"sketches":[7,52,72],"created":[8],"under":[9],"time":[10],"pressure,":[11],"leaving":[12],"much":[13],"of":[14,111],"the":[15,59],"designer\u2019s":[16],"intent":[17,91,109,137],"implicit.":[18],"In":[19],"practice,":[20],"designers":[21],"frequently":[22],"speak":[23],"while":[24],"sketching,":[25],"verbally":[26],"articulating":[27],"functional":[28],"goals":[29],"and":[30,119],"ideas":[31],"that":[32,44,102,125],"are":[33,86],"difficult":[34],"to":[35,134],"express":[36],"visually.":[37],"We":[38],"introduce":[39],"TalkSketchD,":[40],"a":[41,64,93,97],"sketch-while-speaking":[42],"dataset":[43],"captures":[45],"spontaneous":[46,104],"speech":[47,76,105],"temporally":[48,126],"aligned":[49,127],"with":[50,71],"freehand":[51],"during":[53],"early-stage":[54,139],"toaster":[55],"ideation.":[56,141],"To":[57],"examine":[58],"dataset\u2019s":[60],"value,":[61],"we":[62],"conduct":[63],"sketch-to-image":[65],"generation":[66],"study":[67],"comparing":[68],"sketch-only":[69],"inputs":[70],"augmented":[73],"by":[74],"concurrent":[75],"transcripts":[77],"using":[78,92],"multimodal":[79],"large":[80],"language":[81],"models":[82],"(MLLMs).":[83],"Generated":[84],"images":[85,114],"evaluated":[87],"against":[88],"designers\u2019":[89],"self-reported":[90],"reasoning":[94],"MLLM":[95],"as":[96],"judge.":[98],"Quantitative":[99],"results":[100],"show":[101],"incorporating":[103],"significantly":[106],"improves":[107],"judged":[108],"alignment":[110],"generated":[112],"across":[115],"form,":[116],"function,":[117],"experience,":[118],"overall":[120],"intent.":[121],"These":[122],"findings":[123],"demonstrate":[124],"sketch-and-speech":[128],"data":[129],"can":[130],"enhance":[131],"MLLMs\u2019":[132],"ability":[133],"interpret":[135],"user":[136],"in":[138]},"counts_by_year":[],"updated_date":"2026-06-14T06:11:07.267592","created_date":"2026-06-13T00:00:00"}
