{"id":"https://openalex.org/W4283834483","doi":"https://doi.org/10.21437/interspeech.2022-592","title":"M-Adapter: Modality Adaptation for End-to-End Speech-to-Text Translation","display_name":"M-Adapter: Modality Adaptation for End-to-End Speech-to-Text Translation","publication_year":2022,"publication_date":"2022-09-16","ids":{"openalex":"https://openalex.org/W4283834483","doi":"https://doi.org/10.21437/interspeech.2022-592"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2022-592","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2022-592","pdf_url":null,"source":{"id":"https://openalex.org/S4363604309","display_name":"Interspeech 2022","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2022","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://research.monash.edu/en/publications/4557acf2-584a-4b93-b282-0412afe22acd","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100531552","display_name":"Jinming Zhao","orcid":null},"institutions":[{"id":"https://openalex.org/I56590836","display_name":"Monash University","ror":"https://ror.org/02bfwt286","country_code":"AU","type":"education","lineage":["https://openalex.org/I56590836"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Jinming Zhao","raw_affiliation_strings":["Department of Data Science & AI, Monash University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Data Science & AI, Monash University","institution_ids":["https://openalex.org/I56590836"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Hao Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I56590836","display_name":"Monash University","ror":"https://ror.org/02bfwt286","country_code":"AU","type":"education","lineage":["https://openalex.org/I56590836"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Hao Yang","raw_affiliation_strings":["Department of Data Science & AI, Monash University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Data Science & AI, Monash University","institution_ids":["https://openalex.org/I56590836"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081525024","display_name":"Gholamreza Haffari","orcid":"https://orcid.org/0000-0001-7326-8380"},"institutions":[{"id":"https://openalex.org/I56590836","display_name":"Monash University","ror":"https://ror.org/02bfwt286","country_code":"AU","type":"education","lineage":["https://openalex.org/I56590836"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Gholamreza Haffari","raw_affiliation_strings":["Department of Data Science & AI, Monash University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Data Science & AI, Monash University","institution_ids":["https://openalex.org/I56590836"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5086032589","display_name":"Ehsan Shareghi","orcid":null},"institutions":[{"id":"https://openalex.org/I56590836","display_name":"Monash University","ror":"https://ror.org/02bfwt286","country_code":"AU","type":"education","lineage":["https://openalex.org/I56590836"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Ehsan Shareghi","raw_affiliation_strings":["Department of Data Science & AI, Monash University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Data Science & AI, Monash University","institution_ids":["https://openalex.org/I56590836"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.142,"has_fulltext":true,"cited_by_count":11,"citation_normalized_percentile":{"value":0.7980917,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"111","last_page":"115"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9592999815940857,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9592999815940857,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9124000072479248,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9117000102996826,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8133736848831177},{"id":"https://openalex.org/keywords/adapter","display_name":"Adapter (computing)","score":0.7847734689712524},{"id":"https://openalex.org/keywords/speech-translation","display_name":"Speech translation","score":0.733634352684021},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7275781035423279},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.6562939286231995},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5882027745246887},{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.5198019742965698},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.501441240310669},{"id":"https://openalex.org/keywords/machine-translation","display_name":"Machine translation","score":0.4801637530326843},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.4299164116382599},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4204506278038025},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4156327247619629},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.11623227596282959},{"id":"https://openalex.org/keywords/computer-hardware","display_name":"Computer hardware","score":0.06994488835334778}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8133736848831177},{"id":"https://openalex.org/C177284502","wikidata":"https://www.wikidata.org/wiki/Q1005390","display_name":"Adapter (computing)","level":2,"score":0.7847734689712524},{"id":"https://openalex.org/C2780366754","wikidata":"https://www.wikidata.org/wiki/Q7494857","display_name":"Speech translation","level":3,"score":0.733634352684021},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7275781035423279},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.6562939286231995},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5882027745246887},{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.5198019742965698},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.501441240310669},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.4801637530326843},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.4299164116382599},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4204506278038025},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4156327247619629},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.11623227596282959},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.06994488835334778},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.21437/interspeech.2022-592","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2022-592","pdf_url":null,"source":{"id":"https://openalex.org/S4363604309","display_name":"Interspeech 2022","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2022","raw_type":"proceedings-article"},{"id":"pmh:oai:monash.edu:openaire/4557acf2-584a-4b93-b282-0412afe22acd","is_oa":true,"landing_page_url":"https://research.monash.edu/en/publications/4557acf2-584a-4b93-b282-0412afe22acd","pdf_url":null,"source":{"id":"https://openalex.org/S4306402625","display_name":"Monash University Research Portal (Monash University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I56590836","host_organization_name":"Monash University","host_organization_lineage":["https://openalex.org/I56590836"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Zhao, J, Yang, H, Shareghi, E & Haffari, G 2022, M-Adapter : modality adaptation for end-to-end speech-to-text translation. in K Lee, L Lamel, M Hasegawa-Johnson, K Livescu & O Kang (eds), Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH 2022. International Speech Communication Association (ISCA), France, pp. 111-115, Annual Conference of the International Speech Communication Association (was Eurospeech) 2022, Incheon, Korea, South, 18/09/22. https://doi.org/10.21437/Interspeech.2022-592","raw_type":"info:eu-repo/semantics/publishedVersion"},{"id":"pmh:oai:monash.edu:publications/4557acf2-584a-4b93-b282-0412afe22acd","is_oa":true,"landing_page_url":"http://www.scopus.com/inward/record.url?scp=85140091189&partnerID=8YFLogxK","pdf_url":null,"source":{"id":"https://openalex.org/S4306402625","display_name":"Monash University Research Portal (Monash University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I56590836","host_organization_name":"Monash University","host_organization_lineage":["https://openalex.org/I56590836"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Zhao , J , Yang , H , Shareghi , E &amp; Haffari , G 2022 , M-Adapter : modality adaptation for end-to-end speech-to-text translation . in K Lee , L Lamel , M Hasegawa-Johnson , K Livescu &amp; O Kang (eds) , Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH 2022 . International Speech Communication Association (ISCA) , France , pp. 111-115 , Annual Conference of the International Speech Communication Association (was Eurospeech) 2022 , Incheon , Korea, South , 18/09/22 . https://doi.org/10.21437/Interspeech.2022-592","raw_type":"contributionToPeriodical"}],"best_oa_location":{"id":"pmh:oai:monash.edu:openaire/4557acf2-584a-4b93-b282-0412afe22acd","is_oa":true,"landing_page_url":"https://research.monash.edu/en/publications/4557acf2-584a-4b93-b282-0412afe22acd","pdf_url":null,"source":{"id":"https://openalex.org/S4306402625","display_name":"Monash University Research Portal (Monash University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I56590836","host_organization_name":"Monash University","host_organization_lineage":["https://openalex.org/I56590836"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Zhao, J, Yang, H, Shareghi, E & Haffari, G 2022, M-Adapter : modality adaptation for end-to-end speech-to-text translation. in K Lee, L Lamel, M Hasegawa-Johnson, K Livescu & O Kang (eds), Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH 2022. International Speech Communication Association (ISCA), France, pp. 111-115, Annual Conference of the International Speech Communication Association (was Eurospeech) 2022, Incheon, Korea, South, 18/09/22. https://doi.org/10.21437/Interspeech.2022-592","raw_type":"info:eu-repo/semantics/publishedVersion"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16","score":0.4099999964237213}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W2079722985","https://openalex.org/W2605131327","https://openalex.org/W2951974815","https://openalex.org/W2997436923","https://openalex.org/W3015213852","https://openalex.org/W3035490255","https://openalex.org/W3036601975","https://openalex.org/W3092085609","https://openalex.org/W3092424727","https://openalex.org/W3097777922","https://openalex.org/W3105669983","https://openalex.org/W3107826490","https://openalex.org/W3112034174","https://openalex.org/W3125709657","https://openalex.org/W3153583341","https://openalex.org/W3169483174","https://openalex.org/W3173767661","https://openalex.org/W3176382501","https://openalex.org/W3176455679","https://openalex.org/W3176711365","https://openalex.org/W3183148055","https://openalex.org/W3196833881","https://openalex.org/W3198526264","https://openalex.org/W3200578235","https://openalex.org/W4226380987","https://openalex.org/W4287329822","https://openalex.org/W4287629556","https://openalex.org/W4297808394","https://openalex.org/W4385245566"],"related_works":["https://openalex.org/W2990025607","https://openalex.org/W3045103338","https://openalex.org/W2338806053","https://openalex.org/W3007142233","https://openalex.org/W3037186962","https://openalex.org/W4385571610","https://openalex.org/W3177132412","https://openalex.org/W3198731777","https://openalex.org/W2293738010","https://openalex.org/W4391147652"],"abstract_inverted_index":{"End-to-end":[0],"speech-to-text":[1,85],"translation":[2,60,86],"models":[3],"are":[4],"often":[5],"initialized":[6],"with":[7],"pre-trained":[8,12],"speech":[9,33,55,71,78,95],"encoder":[10,37],"and":[11,24,38,56,90],"text":[13,39,57],"decoder.":[14,43],"This":[15],"leads":[16],"to":[17,28,41,49,58,69,73,110],"a":[18,65,94,105],"significant":[19],"training":[20],"gap":[21,53],"between":[22,32,54],"pretraining":[23],"fine-tuning,":[25],"largely":[26],"due":[27],"the":[29,36,42,51,77,115],"modality":[30,52],"differences":[31],"outputs":[34],"from":[35],"inputs":[40],"In":[44],"this":[45],"work,":[46],"we":[47],"aim":[48],"bridge":[50],"improve":[59],"quality.":[61],"We":[62],"propose":[63],"M-Adapter,":[64],"novel":[66],"Transformer-based":[67],"module,":[68],"adapt":[70],"representations":[72],"text.":[74],"While":[75],"shrinking":[76],"sequence,":[79],"M-Adapter":[80],"produces":[81],"features":[82],"desired":[83],"for":[84],"via":[87],"modelling":[88],"global":[89],"local":[91],"dependencies":[92],"of":[93],"sequence.":[96],"Our":[97],"experimental":[98],"results":[99],"show":[100],"that":[101],"our":[102],"model":[103],"outperforms":[104],"strong":[106],"baseline":[107],"by":[108],"up":[109],"1":[111],"BLEU":[112],"score":[113],"on":[114],"Must-C":[116],"En\u2192DE":[117],"dataset.":[118]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":2},{"year":2023,"cited_by_count":5},{"year":2022,"cited_by_count":2}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
