{"id":"https://openalex.org/W4405355443","doi":"https://doi.org/10.1109/iccv51701.2025.00352","title":"LYRA: An Efficient and Speech-Centric Framework for Omni-Cognition","display_name":"LYRA: An Efficient and Speech-Centric Framework for Omni-Cognition","publication_year":2025,"publication_date":"2025-10-19","ids":{"openalex":"https://openalex.org/W4405355443","doi":"https://doi.org/10.1109/iccv51701.2025.00352"},"language":"en","primary_location":{"id":"doi:10.1109/iccv51701.2025.00352","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccv51701.2025.00352","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision (ICCV)","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2412.09501","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102709889","display_name":"Zhisheng Zhong","orcid":"https://orcid.org/0000-0003-2944-9680"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":true,"raw_author_name":"Zhisheng Zhong","raw_affiliation_strings":["CUHK"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"CUHK","institution_ids":["https://openalex.org/I889458895"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103090275","display_name":"Chengyao Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Chengyao Wang","raw_affiliation_strings":["CUHK"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"CUHK","institution_ids":["https://openalex.org/I889458895"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100400509","display_name":"Yuqi Liu","orcid":"https://orcid.org/0000-0002-1556-1930"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Yuqi Liu","raw_affiliation_strings":["CUHK"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"CUHK","institution_ids":["https://openalex.org/I889458895"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066489766","display_name":"Senqiao Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Senqiao Yang","raw_affiliation_strings":["CUHK"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"CUHK","institution_ids":["https://openalex.org/I889458895"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010564305","display_name":"Longxiang Tang","orcid":"https://orcid.org/0009-0005-2704-3718"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Longxiang Tang","raw_affiliation_strings":["HKUST"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"HKUST","institution_ids":["https://openalex.org/I200769079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009874821","display_name":"Yuechen Zhang","orcid":"https://orcid.org/0009-0000-9112-0216"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Yuechen Zhang","raw_affiliation_strings":["CUHK"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"CUHK","institution_ids":["https://openalex.org/I889458895"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102759648","display_name":"Jingyao Li","orcid":"https://orcid.org/0009-0009-7413-6190"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Jingyao Li","raw_affiliation_strings":["CUHK"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"CUHK","institution_ids":["https://openalex.org/I889458895"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009847607","display_name":"Tan Qu","orcid":null},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Tianyuan Qu","raw_affiliation_strings":["CUHK"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"CUHK","institution_ids":["https://openalex.org/I889458895"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115588378","display_name":"Yanwei Li","orcid":"https://orcid.org/0009-0008-3939-6360"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Yanwei Li","raw_affiliation_strings":["CUHK"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"CUHK","institution_ids":["https://openalex.org/I889458895"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5060855982","display_name":"Yukang Chen","orcid":"https://orcid.org/0000-0002-5933-2654"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Yukang Chen","raw_affiliation_strings":["CUHK"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"CUHK","institution_ids":["https://openalex.org/I889458895"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036441177","display_name":"Shuting Yu","orcid":"https://orcid.org/0000-0003-2855-3814"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Shaozuo Yu","raw_affiliation_strings":["CUHK"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"CUHK","institution_ids":["https://openalex.org/I889458895"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016573203","display_name":"Sitong Wu","orcid":"https://orcid.org/0000-0002-7984-784X"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Sitong Wu","raw_affiliation_strings":["CUHK"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"CUHK","institution_ids":["https://openalex.org/I889458895"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023779418","display_name":"Eric Siu-Chung Lo","orcid":"https://orcid.org/0000-0002-8789-8479"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Eric Lo","raw_affiliation_strings":["CUHK"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"CUHK","institution_ids":["https://openalex.org/I889458895"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091885017","display_name":"Shu Liu","orcid":"https://orcid.org/0000-0002-6072-6036"},"institutions":[{"id":"https://openalex.org/I4210127432","display_name":"SMART Reading","ror":"https://ror.org/0345xq683","country_code":"US","type":"nonprofit","lineage":["https://openalex.org/I4210127432"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shu Liu","raw_affiliation_strings":["SmartMore"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"SmartMore","institution_ids":["https://openalex.org/I4210127432"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5052856441","display_name":"Jiaya Jia","orcid":"https://orcid.org/0000-0002-1246-553X"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Jiaya Jia","raw_affiliation_strings":["HKUST"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"HKUST","institution_ids":["https://openalex.org/I200769079"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":15,"corresponding_author_ids":["https://openalex.org/A5102709889"],"corresponding_institution_ids":["https://openalex.org/I889458895"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.00077109,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"3694","last_page":"3704"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9740999937057495,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9740999937057495,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cognition","display_name":"Cognition","score":0.6510485410690308},{"id":"https://openalex.org/keywords/socially-distributed-cognition","display_name":"Socially distributed cognition","score":0.43137505650520325},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.4266808331012726},{"id":"https://openalex.org/keywords/cognitive-science","display_name":"Cognitive science","score":0.3539450764656067},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.29251086711883545},{"id":"https://openalex.org/keywords/neuroscience","display_name":"Neuroscience","score":0.06802639365196228}],"concepts":[{"id":"https://openalex.org/C169900460","wikidata":"https://www.wikidata.org/wiki/Q2200417","display_name":"Cognition","level":2,"score":0.6510485410690308},{"id":"https://openalex.org/C17037162","wikidata":"https://www.wikidata.org/wiki/Q7551870","display_name":"Socially distributed cognition","level":3,"score":0.43137505650520325},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4266808331012726},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.3539450764656067},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.29251086711883545},{"id":"https://openalex.org/C169760540","wikidata":"https://www.wikidata.org/wiki/Q207011","display_name":"Neuroscience","level":1,"score":0.06802639365196228}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/iccv51701.2025.00352","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccv51701.2025.00352","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision (ICCV)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2412.09501","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.09501","pdf_url":"https://arxiv.org/pdf/2412.09501","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2412.09501","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2412.09501","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2412.09501","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.09501","pdf_url":"https://arxiv.org/pdf/2412.09501","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321920","display_name":"Innovation and Technology Commission","ror":"https://ror.org/04vf9tr09"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4405355443.pdf","grobid_xml":"https://content.openalex.org/works/W4405355443.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2463045488","https://openalex.org/W2779186401","https://openalex.org/W1971199552","https://openalex.org/W2081348319","https://openalex.org/W1974622901","https://openalex.org/W4248151009","https://openalex.org/W4251641042","https://openalex.org/W3127991229","https://openalex.org/W2074627502","https://openalex.org/W2134632992"],"abstract_inverted_index":{"As":[0],"Multi-modal":[1],"Large":[2],"Language":[3],"Models":[4],"(MLLMs)":[5],"evolve,":[6],"expanding":[7],"beyond":[8],"single-domain":[9],"capabilities":[10],"is":[11],"essential":[12],"to":[13,78,93,129,141],"meet":[14],"the":[15,95],"demands":[16],"for":[17],"more":[18,137],"versatile":[19],"and":[20,53,60,73,82,91,99,106,122,135,152,161],"efficient":[21,39],"AI.":[22],"However,":[23],"previous":[24],"omni-models":[25],"have":[26],"insufficiently":[27],"explored":[28],"speech,":[29],"neglecting":[30],"its":[31],"integration":[32],"with":[33],"multi-modality.":[34],"We":[35],"introduce":[36],"Lyra,":[37],"an":[38],"MLLM":[40],"that":[41,113],"enhances":[42],"multimodal":[43],"abilities,":[44],"including":[45],"advanced":[46],"long-speech":[47],"comprehension,":[48],"sound":[49],"understanding,":[50],"cross-modality":[51],"efficiency,":[52],"seamless":[54],"speech":[55,98,125,133],"interaction.":[56],"To":[57],"achieve":[58,136],"efficiency":[59],"speech-centric":[61],"capabilities,":[62],"Lyra":[63,128,144],"employs":[64],"three":[65],"strategies:":[66],"(1)":[67],"leveraging":[68],"existing":[69],"open-source":[70],"large":[71],"models":[72],"a":[74,87,109],"proposed":[75],"multi-modality":[76,89],"LoRA":[77],"reduce":[79],"training":[80,163],"costs":[81],"data":[83,120],"requirements;":[84],"(2)":[85],"using":[86,157],"latent":[88],"regularizer":[90],"extractor":[92],"strengthen":[94],"relationship":[96],"between":[97],"other":[100,142],"modalities,":[101],"thereby":[102],"enhancing":[103],"model":[104],"performance;":[105],"(3)":[107],"constructing":[108],"high-quality,":[110],"extensive":[111],"dataset":[112],"includes":[114],"1.5M":[115],"multi-modal":[116],"(language,":[117],"vision,":[118],"audio)":[119],"samples":[121],"12K":[123],"long":[124,132],"samples,":[126],"enabling":[127],"handle":[130],"complex":[131],"inputs":[134],"robust":[138],"omni-cognition.":[139],"Compared":[140],"omni-methods,":[143],"achieves":[145],"state-of-the-art":[146],"performance":[147],"on":[148],"various":[149],"vision-language,":[150],"vision-speech,":[151],"speech-language":[153],"benchmarks,":[154],"while":[155],"also":[156],"fewer":[158],"computational":[159],"resources":[160],"less":[162],"data.":[164]},"counts_by_year":[],"updated_date":"2026-05-06T06:03:25.996018","created_date":"2025-10-10T00:00:00"}
