{"id":"https://openalex.org/W4372262650","doi":"https://doi.org/10.1109/icassp49357.2023.10096662","title":"I3D: Transformer Architectures with Input-Dependent Dynamic Depth for Speech Recognition","display_name":"I3D: Transformer Architectures with Input-Dependent Dynamic Depth for Speech Recognition","publication_year":2023,"publication_date":"2023-05-05","ids":{"openalex":"https://openalex.org/W4372262650","doi":"https://doi.org/10.1109/icassp49357.2023.10096662"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49357.2023.10096662","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10096662","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5033862822","display_name":"Yifan Peng","orcid":"https://orcid.org/0000-0002-8581-8674"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Yifan Peng","raw_affiliation_strings":["Carnegie Mellon University"],"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015578080","display_name":"Jaesong Lee","orcid":null},"institutions":[{"id":"https://openalex.org/I60922564","display_name":"Naver (South Korea)","ror":"https://ror.org/04nzrnx83","country_code":"KR","type":"company","lineage":["https://openalex.org/I60922564"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Jaesong Lee","raw_affiliation_strings":["NAVER Corporation"],"affiliations":[{"raw_affiliation_string":"NAVER Corporation","institution_ids":["https://openalex.org/I60922564"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5001291873","display_name":"Shinji Watanabe","orcid":"https://orcid.org/0000-0002-5970-8631"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shinji Watanabe","raw_affiliation_strings":["Carnegie Mellon University"],"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University","institution_ids":["https://openalex.org/I74973139"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5033862822"],"corresponding_institution_ids":["https://openalex.org/I74973139"],"apc_list":null,"apc_paid":null,"fwci":4.3949,"has_fulltext":false,"cited_by_count":25,"citation_normalized_percentile":{"value":0.95483345,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8296017646789551},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.7649151086807251},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.7397757768630981},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.6268101930618286},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.41363441944122314},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.34103041887283325}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8296017646789551},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.7649151086807251},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.7397757768630981},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.6268101930618286},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.41363441944122314},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.34103041887283325},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49357.2023.10096662","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10096662","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320307110","display_name":"Delta","ror":"https://ror.org/03g9c1e75"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":42,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1821462560","https://openalex.org/W2127141656","https://openalex.org/W2144499799","https://openalex.org/W2157331557","https://openalex.org/W2327501763","https://openalex.org/W2331143823","https://openalex.org/W2884751099","https://openalex.org/W2962780374","https://openalex.org/W2962944050","https://openalex.org/W2963393494","https://openalex.org/W2972818416","https://openalex.org/W3007328579","https://openalex.org/W3037913581","https://openalex.org/W3091900426","https://openalex.org/W3097777922","https://openalex.org/W3151287998","https://openalex.org/W3162249256","https://openalex.org/W3163105696","https://openalex.org/W3173563887","https://openalex.org/W3196783077","https://openalex.org/W3197148831","https://openalex.org/W3198368663","https://openalex.org/W3200669029","https://openalex.org/W3203140070","https://openalex.org/W3204647170","https://openalex.org/W4224821750","https://openalex.org/W4225302959","https://openalex.org/W4284881636","https://openalex.org/W4295116917","https://openalex.org/W4319862255","https://openalex.org/W6638523607","https://openalex.org/W6679434410","https://openalex.org/W6685943813","https://openalex.org/W6691770337","https://openalex.org/W6729448088","https://openalex.org/W6730091202","https://openalex.org/W6739901393","https://openalex.org/W6766978945","https://openalex.org/W6768080748","https://openalex.org/W6796551075","https://openalex.org/W6839026989"],"related_works":["https://openalex.org/W4390516098","https://openalex.org/W2181948922","https://openalex.org/W2384362569","https://openalex.org/W2055243143","https://openalex.org/W4205302943","https://openalex.org/W2119949815","https://openalex.org/W4206178588","https://openalex.org/W4287635093","https://openalex.org/W3094491777","https://openalex.org/W3214715529"],"abstract_inverted_index":{"Transformer-based":[0],"end-to-end":[1],"speech":[2],"recognition":[3],"has":[4,42],"achieved":[5],"great":[6],"success.":[7],"However,":[8],"the":[9,31,39,78,82,96,100],"large":[10],"footprint":[11],"and":[12,34,81,99],"computational":[13],"overhead":[14],"make":[15],"it":[16],"difficult":[17],"to":[18,61],"deploy":[19],"these":[20],"models":[21,76],"in":[22],"some":[23],"real-world":[24],"applications.":[25],"Model":[26],"compression":[27],"techniques":[28],"can":[29],"reduce":[30],"model":[32,41,85],"size":[33],"speed":[35],"up":[36],"inference,":[37],"but":[38],"compressed":[40],"a":[43,52,67],"fixed":[44],"architecture":[45],"which":[46,102],"might":[47],"be":[48],"suboptimal.":[49],"We":[50,90],"propose":[51],"novel":[53],"Transformer":[54,80],"encoder":[55],"with":[56],"Input-Dependent":[57],"Dynamic":[58],"Depth":[59],"(I3D)":[60],"achieve":[62],"strong":[63],"performance-efficiency":[64],"trade-offs.":[65],"With":[66],"similar":[68],"number":[69],"of":[70],"layers":[71],"at":[72],"inference":[73],"time,":[74],"I3D-based":[75],"outperform":[77],"vanilla":[79],"static":[83],"pruned":[84],"via":[86],"iterative":[87],"layer":[88],"pruning.":[89],"also":[91],"present":[92],"interesting":[93],"analysis":[94],"on":[95],"gate":[97],"probabilities":[98],"input-dependency,":[101],"helps":[103],"us":[104],"better":[105],"understand":[106],"deep":[107],"encoders.":[108]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":17},{"year":2024,"cited_by_count":3},{"year":2023,"cited_by_count":3}],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2025-10-10T00:00:00"}
