{"id":"https://openalex.org/W7137307158","doi":"https://doi.org/10.48550/arxiv.2603.12628","title":"Towards unified brain-to-text decoding across speech production and perception","display_name":"Towards unified brain-to-text decoding across speech production and perception","publication_year":2026,"publication_date":"2026-03-13","ids":{"openalex":"https://openalex.org/W7137307158","doi":"https://doi.org/10.48550/arxiv.2603.12628"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.12628","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.12628","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.12628","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129440107","display_name":"Zhizhang Yuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yuan, Zhizhang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129622667","display_name":"Yang Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Yang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129637726","display_name":"Gaorui Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Gaorui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129434783","display_name":"Baowen Cheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cheng, Baowen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129453874","display_name":"Zehan Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Zehan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129426891","display_name":"Yuhao Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Yuhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129587258","display_name":"Xiaoying Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Xiaoying","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129545842","display_name":"Liang Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Liang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107985523","display_name":"Ying Mao","orcid":"https://orcid.org/0000-0001-8055-115X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mao, Ying","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5110943100","display_name":"Meng Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Meng","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5129440107"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10465","display_name":"Neurobiology of Language and Bilingualism","score":0.9235000014305115,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T10465","display_name":"Neurobiology of Language and Bilingualism","score":0.9235000014305115,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10403","display_name":"Phonetics and Phonology Research","score":0.01979999989271164,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10730","display_name":"Language Development and Disorders","score":0.009399999864399433,"subfield":{"id":"https://openalex.org/subfields/3204","display_name":"Developmental and Educational Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.7991999983787537},{"id":"https://openalex.org/keywords/mandarin-chinese","display_name":"Mandarin Chinese","score":0.713100016117096},{"id":"https://openalex.org/keywords/speech-production","display_name":"Speech production","score":0.5697000026702881},{"id":"https://openalex.org/keywords/neural-decoding","display_name":"Neural decoding","score":0.5454000234603882},{"id":"https://openalex.org/keywords/syllable","display_name":"Syllable","score":0.5228999853134155},{"id":"https://openalex.org/keywords/production","display_name":"Production (economics)","score":0.4648999869823456},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.44609999656677246},{"id":"https://openalex.org/keywords/speech-perception","display_name":"Speech perception","score":0.44200000166893005}],"concepts":[{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.7991999983787537},{"id":"https://openalex.org/C138954614","wikidata":"https://www.wikidata.org/wiki/Q9192","display_name":"Mandarin Chinese","level":2,"score":0.713100016117096},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.661300003528595},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6547999978065491},{"id":"https://openalex.org/C43617652","wikidata":"https://www.wikidata.org/wiki/Q7575399","display_name":"Speech production","level":2,"score":0.5697000026702881},{"id":"https://openalex.org/C40743351","wikidata":"https://www.wikidata.org/wiki/Q7002049","display_name":"Neural decoding","level":3,"score":0.5454000234603882},{"id":"https://openalex.org/C109089402","wikidata":"https://www.wikidata.org/wiki/Q8188","display_name":"Syllable","level":2,"score":0.5228999853134155},{"id":"https://openalex.org/C2778348673","wikidata":"https://www.wikidata.org/wiki/Q739302","display_name":"Production (economics)","level":2,"score":0.4648999869823456},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.44609999656677246},{"id":"https://openalex.org/C99209842","wikidata":"https://www.wikidata.org/wiki/Q643696","display_name":"Speech perception","level":3,"score":0.44200000166893005},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.40779998898506165},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3831999897956848},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.38280001282691956},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3804999887943268},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.36399999260902405},{"id":"https://openalex.org/C2776264592","wikidata":"https://www.wikidata.org/wiki/Q463837","display_name":"Language production","level":3,"score":0.3617999851703644},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.34790000319480896},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.3474000096321106},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.34290000796318054},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3192000091075897},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.3034000098705292},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.2696000039577484},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.2678999900817871}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.12628","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.12628","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.12628","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.12628","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.7366253733634949,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Speech":[0],"production":[1,36,158,162,223],"and":[2,22,37,57,60,70,92,124,159,192,224,235],"perception":[3,38,184],"are":[4],"the":[5,205,217,237],"main":[6],"ways":[7],"humans":[8],"communicate":[9],"daily.":[10],"Prior":[11],"brain-to-text":[12,230],"decoding":[13,31,50,193,210,231,242],"studies":[14],"have":[15],"largely":[16],"focused":[17],"on":[18,54,129],"a":[19,28,99,121,130,186,208],"single":[20],"modality":[21],"alphabetic":[23],"languages.":[24],"Here,":[25],"we":[26,119],"present":[27],"unified":[29,209],"brain-to-sentence":[30],"framework":[32,43,127,211],"for":[33],"both":[34,176],"speech":[35,79,157,161,183,222],"in":[39,87,155,232],"Mandarin":[40,78,156,221],"Chinese.":[41],"The":[42],"exhibits":[44],"strong":[45],"generalization":[46],"ability,":[47],"enabling":[48],"sentence-level":[49],"when":[51],"trained":[52],"only":[53,203],"single-character":[55],"data":[56],"supporting":[58,244],"characters":[59],"syllables":[61,111],"unseen":[62],"during":[63],"training.":[64],"In":[65,149],"addition,":[66,150],"it":[67],"allows":[68],"direct":[69],"controlled":[71],"comparison":[72],"of":[73,108,143,145,207,220],"neural":[74,95,164,218,240],"dynamics":[75],"across":[76,166,198],"modalities.":[77,246],"is":[80],"decoded":[81],"by":[82,98],"first":[83],"classifying":[84],"syllable":[85],"components":[86],"Hanyu":[88],"Pinyin,":[89],"namely":[90],"initials":[91],"finals,":[93],"from":[94],"signals,":[96],"followed":[97],"post-trained":[100],"large":[101],"language":[102,241],"model":[103],"(LLM)":[104],"that":[105,136],"maps":[106],"sequences":[107],"toneless":[109],"Pinyin":[110],"to":[112,175,190,229],"Chinese":[113],"sentences.":[114],"To":[115],"enhance":[116],"LLM":[117],"decoding,":[118],"designed":[120],"three-stage":[122],"post-training":[123],"two-stage":[125],"inference":[126],"based":[128],"7-billion-parameter":[131],"LLM,":[132],"achieving":[133],"overall":[134],"performance":[135,194],"exceeds":[137],"larger":[138],"commercial":[139],"LLMs":[140],"with":[141,182],"hundreds":[142],"billions":[144],"parameters":[146],"or":[147],"more.":[148],"several":[151],"characteristics":[152,219],"were":[153],"observed":[154],"perception:":[160],"involved":[163],"responses":[165],"broader":[167],"cortical":[168],"regions":[169],"than":[170],"auditory":[171],"perception;":[172],"channels":[173],"responsive":[174],"modalities":[177],"exhibited":[178],"similar":[179],"activity":[180],"patterns,":[181],"showing":[185],"temporal":[187],"delay":[188],"relative":[189],"production;":[191],"was":[195],"broadly":[196],"comparable":[197],"hemispheres.":[199],"Our":[200],"work":[201],"not":[202],"establishes":[204],"feasibility":[206],"but":[212],"also":[213],"provides":[214],"insights":[215],"into":[216],"perception.":[225],"These":[226],"advances":[227],"contribute":[228],"logosyllabic":[233],"languages":[234],"pave":[236],"way":[238],"toward":[239],"systems":[243],"multiple":[245]},"counts_by_year":[],"updated_date":"2026-03-17T07:05:13.627479","created_date":"2026-03-17T00:00:00"}
