{"id":"https://openalex.org/W7131400360","doi":"https://doi.org/10.48550/arxiv.2602.19409","title":"AuditoryHuM: Auditory Scene Label Generation and Clustering using Human-MLLM Collaboration","display_name":"AuditoryHuM: Auditory Scene Label Generation and Clustering using Human-MLLM Collaboration","publication_year":2026,"publication_date":"2026-02-23","ids":{"openalex":"https://openalex.org/W7131400360","doi":"https://doi.org/10.48550/arxiv.2602.19409"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2602.19409","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.19409","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2602.19409","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5050418485","display_name":"Henry Zhong","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhong, Henry","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030827600","display_name":"J\u00f6rg M. Buchholz","orcid":"https://orcid.org/0000-0001-6188-9761"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Buchholz, J\u00f6rg M.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126745149","display_name":"Julian Maclaren","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Maclaren, Julian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013630753","display_name":"Simon Carlile","orcid":"https://orcid.org/0000-0001-6952-7976"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Carlile, Simon","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5026382065","display_name":"Richard F. Lyon","orcid":"https://orcid.org/0000-0003-2348-811X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lyon, Richard F.","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5050418485"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.7376999855041504,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.7376999855041504,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.03530000150203705,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.016699999570846558,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.7111999988555908},{"id":"https://openalex.org/keywords/silhouette","display_name":"Silhouette","score":0.5534999966621399},{"id":"https://openalex.org/keywords/annotation","display_name":"Annotation","score":0.44600000977516174},{"id":"https://openalex.org/keywords/granularity","display_name":"Granularity","score":0.4099999964237213},{"id":"https://openalex.org/keywords/cohesion","display_name":"Cohesion (chemistry)","score":0.4083000123500824},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.39980000257492065},{"id":"https://openalex.org/keywords/heuristics","display_name":"Heuristics","score":0.3898000121116638},{"id":"https://openalex.org/keywords/enhanced-data-rates-for-gsm-evolution","display_name":"Enhanced Data Rates for GSM Evolution","score":0.3490000069141388},{"id":"https://openalex.org/keywords/spectral-clustering","display_name":"Spectral clustering","score":0.3427000045776367}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7760999798774719},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.7111999988555908},{"id":"https://openalex.org/C58103923","wikidata":"https://www.wikidata.org/wiki/Q2286025","display_name":"Silhouette","level":2,"score":0.5534999966621399},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5268999934196472},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.44600000977516174},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4147000014781952},{"id":"https://openalex.org/C177774035","wikidata":"https://www.wikidata.org/wiki/Q1246948","display_name":"Granularity","level":2,"score":0.4099999964237213},{"id":"https://openalex.org/C104054115","wikidata":"https://www.wikidata.org/wiki/Q216828","display_name":"Cohesion (chemistry)","level":2,"score":0.4083000123500824},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.39980000257492065},{"id":"https://openalex.org/C127705205","wikidata":"https://www.wikidata.org/wiki/Q5748245","display_name":"Heuristics","level":2,"score":0.3898000121116638},{"id":"https://openalex.org/C162307627","wikidata":"https://www.wikidata.org/wiki/Q204833","display_name":"Enhanced Data Rates for GSM Evolution","level":2,"score":0.3490000069141388},{"id":"https://openalex.org/C105611402","wikidata":"https://www.wikidata.org/wiki/Q2976589","display_name":"Spectral clustering","level":3,"score":0.3427000045776367},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.3409000039100647},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.3395000100135803},{"id":"https://openalex.org/C160372630","wikidata":"https://www.wikidata.org/wiki/Q4819855","display_name":"Audio analyzer","level":5,"score":0.33550000190734863},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.3222000002861023},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.30979999899864197},{"id":"https://openalex.org/C93692415","wikidata":"https://www.wikidata.org/wiki/Q1502030","display_name":"Thematic map","level":2,"score":0.3050000071525574},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3050000071525574},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.3012999892234802},{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.2906999886035919},{"id":"https://openalex.org/C8038995","wikidata":"https://www.wikidata.org/wiki/Q1152135","display_name":"Unsupervised learning","level":2,"score":0.28060001134872437},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2759000062942505},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.2637999951839447},{"id":"https://openalex.org/C121687571","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Activity recognition","level":2,"score":0.26030001044273376},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2563999891281128},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.25369998812675476},{"id":"https://openalex.org/C46686674","wikidata":"https://www.wikidata.org/wiki/Q466303","display_name":"Boosting (machine learning)","level":2,"score":0.25290000438690186}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2602.19409","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.19409","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2602.19409","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.19409","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Decent work and economic growth","id":"https://metadata.un.org/sdg/8","score":0.5821483135223389}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Manual":[0],"annotation":[1],"of":[2,31,151],"audio":[3,57,82],"datasets":[4,130],"is":[5,10,89],"labour":[6],"intensive,":[7],"and":[8,29,48,63,80,121,133,164,171],"it":[9],"challenging":[11],"to":[12,72,92,117,157],"balance":[13,118],"label":[14,61],"granularity":[15],"with":[16],"acoustic":[17],"separability.":[18],"We":[19],"introduce":[20],"AuditoryHuM,":[21],"a":[22,36,114,138],"novel":[23],"framework":[24,51],"for":[25,56,142],"the":[26,50,74,94,149],"unsupervised":[27],"discovery":[28],"clustering":[30],"auditory":[32,128],"scene":[33,129,153],"labels":[34,55,79,100],"using":[35,107],"collaborative":[37],"Human-Multimodal":[38],"Large":[39],"Language":[40],"Model":[41],"(MLLM)":[42],"approach.":[43],"By":[44],"leveraging":[45],"MLLMs":[46],"(Gemma":[47],"Qwen)":[49],"generates":[52],"contextually":[53],"relevant":[54],"data.":[58],"To":[59],"ensure":[60],"quality":[62],"mitigate":[64],"hallucinations,":[65],"we":[66],"employ":[67],"zero-shot":[68],"learning":[69],"techniques":[70],"(Human-CLAP)":[71],"quantify":[73],"alignment":[75],"between":[76],"generated":[77],"text":[78],"raw":[81],"content.":[83],"A":[84],"strategically":[85],"targeted":[86],"human-in-the-loop":[87],"intervention":[88],"then":[90],"used":[91],"refine":[93],"least":[95],"aligned":[96],"pairs.":[97],"The":[98,168],"discovered":[99],"are":[101],"grouped":[102],"into":[103],"thematically":[104],"cohesive":[105],"clusters":[106],"an":[108],"adjusted":[109],"silhouette":[110],"score":[111],"that":[112],"incorporates":[113],"penalty":[115],"parameter":[116],"cluster":[119],"cohesion":[120],"thematic":[122],"granularity.":[123],"Evaluated":[124],"across":[125],"three":[126],"diverse":[127],"(ADVANCE,":[131],"AHEAD-DS,":[132],"TAU":[134],"2019),":[135],"AuditoryHuM":[136],"provides":[137],"scalable,":[139],"low-cost":[140],"solution":[141,147],"creating":[143],"standardised":[144],"taxonomies.":[145],"This":[146],"facilitates":[148],"training":[150],"lightweight":[152],"recognition":[154],"models":[155],"deployable":[156],"edge":[158],"devices,":[159],"such":[160],"as":[161],"hearing":[162],"aids":[163],"smart":[165],"home":[166],"assistants.":[167],"project":[169],"page":[170],"code:":[172],"https://github.com/Australian-Future-Hearing-Initiative":[173]},"counts_by_year":[],"updated_date":"2026-02-26T06:34:08.959763","created_date":"2026-02-26T00:00:00"}
