{"id":"https://openalex.org/W4377130738","doi":"https://doi.org/10.48550/arxiv.2305.10649","title":"ZeroPrompt: Streaming Acoustic Encoders are Zero-Shot Masked LMs","display_name":"ZeroPrompt: Streaming Acoustic Encoders are Zero-Shot Masked LMs","publication_year":2023,"publication_date":"2023-05-18","ids":{"openalex":"https://openalex.org/W4377130738","doi":"https://doi.org/10.48550/arxiv.2305.10649"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2305.10649","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2305.10649","pdf_url":"https://arxiv.org/pdf/2305.10649","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2305.10649","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5074833963","display_name":"Xingchen Song","orcid":"https://orcid.org/0009-0009-9516-5361"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Song, Xingchen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100599890","display_name":"Di Wu","orcid":"https://orcid.org/0000-0002-4753-8161"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Di","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100449711","display_name":"Binbin Zhang","orcid":"https://orcid.org/0000-0002-1345-2220"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Binbin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041630666","display_name":"Zhendong Peng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peng, Zhendong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110978167","display_name":"Bo Dang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dang, Bo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031150833","display_name":"Fuping Pan","orcid":"https://orcid.org/0000-0001-9171-0726"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pan, Fuping","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100667025","display_name":"Zhiyong Wu","orcid":"https://orcid.org/0000-0002-6527-5502"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Zhiyong","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5074833963"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.995199978351593,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9934999942779541,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.898582935333252},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.7861089706420898},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7015255689620972},{"id":"https://openalex.org/keywords/reduction","display_name":"Reduction (mathematics)","score":0.5122751593589783},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.46444064378738403},{"id":"https://openalex.org/keywords/zero","display_name":"Zero (linguistics)","score":0.4302656352519989},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.3938482403755188},{"id":"https://openalex.org/keywords/real-time-computing","display_name":"Real-time computing","score":0.32516637444496155},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.16725295782089233},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.11737233400344849}],"concepts":[{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.898582935333252},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.7861089706420898},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7015255689620972},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.5122751593589783},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.46444064378738403},{"id":"https://openalex.org/C2780813799","wikidata":"https://www.wikidata.org/wiki/Q3274237","display_name":"Zero (linguistics)","level":2,"score":0.4302656352519989},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3938482403755188},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.32516637444496155},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.16725295782089233},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.11737233400344849},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2305.10649","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2305.10649","pdf_url":"https://arxiv.org/pdf/2305.10649","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2305.10649","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2305.10649","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article-journal"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2305.10649","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2305.10649","pdf_url":"https://arxiv.org/pdf/2305.10649","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[{"score":0.4399999976158142,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4377130738.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4388335561","https://openalex.org/W2970530566","https://openalex.org/W4288261899","https://openalex.org/W4307309205","https://openalex.org/W2967478618","https://openalex.org/W4385009901","https://openalex.org/W4385572700","https://openalex.org/W2997152889","https://openalex.org/W4387768015","https://openalex.org/W4285141722"],"abstract_inverted_index":{"In":[0],"this":[1],"paper,":[2],"we":[3,114],"present":[4],"ZeroPrompt":[5,40,89],"(Figure":[6,13],"1-(a))":[7],"and":[8,84,93,126,139,146],"the":[9,23,58,77],"corresponding":[10],"Prompt-and-Refine":[11],"strategy":[12],"3),":[14],"two":[15],"simple":[16],"but":[17],"effective":[18],"\\textbf{training-free}":[19],"methods":[20],"to":[21,42,46,56,60,97],"decrease":[22],"Token":[24,122,133],"Display":[25,123,134],"Time":[26,124,135],"(TDT)":[27],"of":[28,39,80],"streaming":[29,72,98],"ASR":[30],"models":[31],"\\textbf{without":[32],"any":[33,102,105],"accuracy":[34,106],"loss}.":[35],"The":[36],"core":[37],"idea":[38],"is":[41,90],"append":[43],"zeroed":[44],"content":[45],"each":[47],"chunk":[48],"during":[49],"inference,":[50],"which":[51],"acts":[52],"like":[53],"a":[54],"prompt":[55],"encourage":[57],"model":[59],"predict":[61],"future":[62],"tokens":[63],"even":[64],"before":[65],"they":[66],"were":[67],"spoken.":[68],"We":[69],"argue":[70],"that":[71,88],"acoustic":[73,99],"encoders":[74,100],"naturally":[75],"have":[76],"modeling":[78],"ability":[79],"Masked":[81],"Language":[82],"Models":[83],"our":[85,111],"experiments":[86],"demonstrate":[87],"engineering":[91],"cheap":[92],"can":[94],"be":[95],"applied":[96],"on":[101,120,131,143],"dataset":[103],"without":[104],"loss.":[107],"Specifically,":[108],"compared":[109],"with":[110,137],"baseline":[112],"models,":[113],"achieve":[115],"350":[116],"$\\sim$":[117,128],"700ms":[118],"reduction":[119,130],"First":[121],"(TDT-F)":[125],"100":[127],"400ms":[129],"Last":[132],"(TDT-L),":[136],"theoretically":[138],"experimentally":[140],"equal":[141],"WER":[142],"both":[144],"Aishell-1":[145],"Librispeech":[147],"datasets.":[148]},"counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2025-10-10T00:00:00"}
