{"id":"https://openalex.org/W7162656786","doi":"https://doi.org/10.48550/arxiv.2605.27976","title":"VoiceGiraffe: A Benchmark for Extreme Long-Context Audio-Language Understanding","display_name":"VoiceGiraffe: A Benchmark for Extreme Long-Context Audio-Language Understanding","publication_year":2026,"publication_date":"2026-05-27","ids":{"openalex":"https://openalex.org/W7162656786","doi":"https://doi.org/10.48550/arxiv.2605.27976"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.27976","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.27976","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.27976","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5137256450","display_name":"Jashin Ye","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ye, Jashin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100609440","display_name":"Dongxiao Wang","orcid":"https://orcid.org/0000-0001-8778-2188"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Dongxiao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137255160","display_name":"Yixuan Ye","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ye, Yixuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5060861535","display_name":"S. Zhou","orcid":"https://orcid.org/0000-0002-7517-9653"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Sashuai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137292758","display_name":"Weihuang Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Weihuang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070378536","display_name":"Mingyang Han","orcid":"https://orcid.org/0000-0001-6969-1573"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Mingyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100676186","display_name":"Kunpeng Wang","orcid":"https://orcid.org/0000-0002-9412-7867"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Kunpeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024431324","display_name":"Zeyu Yuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuan, Zeyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137251104","display_name":"Boyu Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Boyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137236934","display_name":"Haoxiang Shi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shi, Haoxiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137207528","display_name":"Jingchen Shu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shu, Jingchen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102010684","display_name":"Jun Song","orcid":"https://orcid.org/0000-0003-0659-1158"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Song, Jun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5137202815","display_name":"Bo Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Bo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.6324999928474426,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.6324999928474426,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10788","display_name":"Neuroscience and Music Perception","score":0.05730000138282776,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.025499999523162842,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6624000072479248},{"id":"https://openalex.org/keywords/suite","display_name":"Suite","score":0.6150000095367432},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.5730000138282776},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5253999829292297},{"id":"https://openalex.org/keywords/salient","display_name":"Salient","score":0.47609999775886536},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.45080000162124634},{"id":"https://openalex.org/keywords/testbed","display_name":"Testbed","score":0.3919000029563904}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7821999788284302},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6624000072479248},{"id":"https://openalex.org/C79581498","wikidata":"https://www.wikidata.org/wiki/Q1367530","display_name":"Suite","level":2,"score":0.6150000095367432},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.5730000138282776},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5253999829292297},{"id":"https://openalex.org/C2780719617","wikidata":"https://www.wikidata.org/wiki/Q1030752","display_name":"Salient","level":2,"score":0.47609999775886536},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.45080000162124634},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4325999915599823},{"id":"https://openalex.org/C31395832","wikidata":"https://www.wikidata.org/wiki/Q1318674","display_name":"Testbed","level":2,"score":0.3919000029563904},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3522000014781952},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3495999872684479},{"id":"https://openalex.org/C70437156","wikidata":"https://www.wikidata.org/wiki/Q7228652","display_name":"Pooling","level":2,"score":0.33799999952316284},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.3292999863624573},{"id":"https://openalex.org/C511192102","wikidata":"https://www.wikidata.org/wiki/Q5156948","display_name":"Comprehension","level":2,"score":0.3052000105381012},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.30320000648498535},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.29409998655319214},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.27639999985694885},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.265500009059906},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.25780001282691956}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.27976","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.27976","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.27976","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.27976","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[{"score":0.5140735507011414,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"While":[0],"large":[1],"audio":[2,11,21,142,220],"language":[3],"models":[4,138,149,162],"(LALMs)":[5],"have":[6],"achieved":[7],"remarkable":[8],"progress":[9],"in":[10,47],"processing":[12],"at":[13,182],"the":[14,205,223],"second-":[15],"or":[16,33],"minute-level":[17],"scale,":[18],"understanding":[19],"hour-level":[20],"remains":[22,117],"a":[23,63,88,99,176,213],"fundamental":[24,113],"bottleneck.":[25,178],"Existing":[26],"benchmarks":[27],"predominantly":[28],"rely":[29],"on":[30],"short":[31],"clips":[32],"artificially":[34],"concatenated":[35],"segments,":[36],"failing":[37],"to":[38,67],"faithfully":[39],"assess":[40],"LALM":[41],"capacity":[42],"for":[43,218,225],"long-range":[44,172,232],"information":[45],"comprehension":[46],"real-world":[48,73],"scenarios":[49],"such":[50],"as":[51,175,212],"podcasts":[52],"and":[53,76,94,104,120,154,215,230],"lengthy":[54],"speeches.":[55],"To":[56],"address":[57],"this":[58],"gap,":[59],"we":[60,125,170],"introduce":[61],"VoiceGiraffe,":[62],"novel":[64],"benchmark":[65],"designed":[66],"rigorously":[68],"evaluate":[69,98],"LALMs":[70,106,179,226],"across":[71,199],"diverse":[72],"scenarios,":[74],"modalities,":[75],"languages":[77],"under":[78],"long-context":[79,141],"settings.":[80],"It":[81],"comprises":[82],"1500":[83],"curated":[84],"triplets":[85],"structured":[86],"into":[87],"dual-level":[89],"taxonomy":[90],"of":[91,102,196],"single-hop":[92],"perception":[93],"multi-hop":[95],"reasoning.":[96],"We":[97],"broad":[100],"suite":[101],"open-source":[103],"proprietary":[105,167],"against":[107],"human":[108],"performance.":[109],"Results":[110],"underscore":[111],"three":[112],"findings.":[114],"First,":[115],"VoiceGiraffe":[116,211],"highly":[118],"challenging":[119,214],"far":[121],"from":[122],"saturation.":[123],"Second,":[124],"show":[126,204],"that":[127,185],"no":[128],"single":[129],"inference":[130,136],"paradigm":[131],"universally":[132],"dominates.":[133],"The":[134],"E2E":[135],"benefits":[137],"with":[139,157,227],"native":[140],"understanding,":[143,221],"cascaded":[144],"caption":[145],"aggregation":[146],"stabilizes":[147],"small":[148],"overwhelmed":[150],"by":[151],"hour-scale":[152],"audio,":[153,201],"reasoning-enhanced":[155],"cascading":[156],"external":[158],"LLM":[159],"helps":[160],"weaker":[161],"but":[163],"can":[164],"bottleneck":[165],"stronger":[166],"systems.":[168],"Third,":[169],"reveal":[171],"memory":[173,229],"persistence":[174],"key":[177],"are":[180],"better":[181],"answering":[183],"questions":[184],"require":[186],"connecting":[187],"salient":[188],"causal":[189],"cues":[190],"than":[191],"those":[192],"requiring":[193],"sustained":[194],"tracking":[195],"sparse":[197],"events":[198],"long":[200],"whereas":[202],"humans":[203],"opposite":[206],"pattern.":[207],"These":[208],"findings":[209],"position":[210],"diagnostic":[216],"testbed":[217],"long-form":[219],"highlighting":[222],"need":[224],"persistent":[228],"robust":[231],"aggregation.":[233]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-05-29T00:00:00"}
