{"id":"https://openalex.org/W3034892239","doi":"https://doi.org/10.1109/taslp.2020.3001969","title":"Learning Hierarchy Aware Embedding from Raw Audio for Acoustic Scene Classification","display_name":"Learning Hierarchy Aware Embedding from Raw Audio for Acoustic Scene Classification","publication_year":2020,"publication_date":"2020-01-01","ids":{"openalex":"https://openalex.org/W3034892239","doi":"https://doi.org/10.1109/taslp.2020.3001969","mag":"3034892239"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2020.3001969","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2020.3001969","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5075520691","display_name":"Vinayak Abrol","orcid":"https://orcid.org/0000-0001-8149-8151"},"institutions":[{"id":"https://openalex.org/I40120149","display_name":"University of Oxford","ror":"https://ror.org/052gg0110","country_code":"GB","type":"education","lineage":["https://openalex.org/I40120149"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Vinayak Abrol","raw_affiliation_strings":["Mathematical Institute, University of Oxford, Oxford, U.K"],"affiliations":[{"raw_affiliation_string":"Mathematical Institute, University of Oxford, Oxford, U.K","institution_ids":["https://openalex.org/I40120149"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101783272","display_name":"Pulkit Sharma","orcid":"https://orcid.org/0000-0001-7870-7098"},"institutions":[{"id":"https://openalex.org/I40120149","display_name":"University of Oxford","ror":"https://ror.org/052gg0110","country_code":"GB","type":"education","lineage":["https://openalex.org/I40120149"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Pulkit Sharma","raw_affiliation_strings":["Department of Engineering Science, University of Oxford, Oxford, U.K"],"affiliations":[{"raw_affiliation_string":"Department of Engineering Science, University of Oxford, Oxford, U.K","institution_ids":["https://openalex.org/I40120149"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5075520691"],"corresponding_institution_ids":["https://openalex.org/I40120149"],"apc_list":null,"apc_paid":null,"fwci":2.8906,"has_fulltext":false,"cited_by_count":28,"citation_normalized_percentile":{"value":0.91671011,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"1"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.9936000108718872,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.808700680732727},{"id":"https://openalex.org/keywords/pooling","display_name":"Pooling","score":0.56670081615448},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5358392596244812},{"id":"https://openalex.org/keywords/convolutional-neural-network","display_name":"Convolutional neural network","score":0.5268327593803406},{"id":"https://openalex.org/keywords/categorical-variable","display_name":"Categorical variable","score":0.5180811882019043},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5066500306129456},{"id":"https://openalex.org/keywords/acoustic-model","display_name":"Acoustic model","score":0.47136369347572327},{"id":"https://openalex.org/keywords/audio-signal","display_name":"Audio signal","score":0.4662116467952728},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4656665325164795},{"id":"https://openalex.org/keywords/classifier","display_name":"Classifier (UML)","score":0.4534112215042114},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.45095789432525635},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.4275173842906952},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.3422520160675049},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.20801126956939697},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.10410270094871521}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.808700680732727},{"id":"https://openalex.org/C70437156","wikidata":"https://www.wikidata.org/wiki/Q7228652","display_name":"Pooling","level":2,"score":0.56670081615448},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5358392596244812},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.5268327593803406},{"id":"https://openalex.org/C5274069","wikidata":"https://www.wikidata.org/wiki/Q2285707","display_name":"Categorical variable","level":2,"score":0.5180811882019043},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5066500306129456},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.47136369347572327},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.4662116467952728},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4656665325164795},{"id":"https://openalex.org/C95623464","wikidata":"https://www.wikidata.org/wiki/Q1096149","display_name":"Classifier (UML)","level":2,"score":0.4534112215042114},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.45095789432525635},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.4275173842906952},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3422520160675049},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.20801126956939697},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.10410270094871521}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/taslp.2020.3001969","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2020.3001969","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},{"id":"pmh:oai:ora.ox.ac.uk:uuid:4542e285-abf2-44cd-8380-badfecd9b62d","is_oa":false,"landing_page_url":"https://ora.ox.ac.uk/objects/uuid:4542e285-abf2-44cd-8380-badfecd9b62d","pdf_url":null,"source":{"id":"https://openalex.org/S4306402636","display_name":"Oxford University Research Archive (ORA) (University of Oxford)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I40120149","host_organization_name":"University of Oxford","host_organization_lineage":["https://openalex.org/I40120149"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Symplectic Elements","raw_type":"Journal article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":81,"referenced_works":["https://openalex.org/W172871939","https://openalex.org/W1487641199","https://openalex.org/W1666984270","https://openalex.org/W2006546815","https://openalex.org/W2052666245","https://openalex.org/W2095147901","https://openalex.org/W2103235956","https://openalex.org/W2137343183","https://openalex.org/W2157364932","https://openalex.org/W2200864288","https://openalex.org/W2242773987","https://openalex.org/W2398826216","https://openalex.org/W2399733683","https://openalex.org/W2401869809","https://openalex.org/W2403172465","https://openalex.org/W2408239454","https://openalex.org/W2437181147","https://openalex.org/W2472122037","https://openalex.org/W2509065397","https://openalex.org/W2511956680","https://openalex.org/W2513345070","https://openalex.org/W2526050071","https://openalex.org/W2529943390","https://openalex.org/W2552194003","https://openalex.org/W2591013610","https://openalex.org/W2592641653","https://openalex.org/W2592944988","https://openalex.org/W2593116425","https://openalex.org/W2600537992","https://openalex.org/W2602489101","https://openalex.org/W2605372163","https://openalex.org/W2609575245","https://openalex.org/W2617512665","https://openalex.org/W2619623002","https://openalex.org/W2738359832","https://openalex.org/W2752941683","https://openalex.org/W2758739367","https://openalex.org/W2770454110","https://openalex.org/W2777662428","https://openalex.org/W2791956393","https://openalex.org/W2885329609","https://openalex.org/W2888839755","https://openalex.org/W2950928354","https://openalex.org/W2962845248","https://openalex.org/W2962933129","https://openalex.org/W2962959915","https://openalex.org/W2963103976","https://openalex.org/W2963175699","https://openalex.org/W2963281697","https://openalex.org/W2963859210","https://openalex.org/W2964065616","https://openalex.org/W2964135650","https://openalex.org/W2964218314","https://openalex.org/W2964345931","https://openalex.org/W2972717745","https://openalex.org/W3040915569","https://openalex.org/W3098357269","https://openalex.org/W3099869713","https://openalex.org/W3101227480","https://openalex.org/W4293478066","https://openalex.org/W4297665915","https://openalex.org/W6607062892","https://openalex.org/W6629354409","https://openalex.org/W6637061625","https://openalex.org/W6678409544","https://openalex.org/W6690394000","https://openalex.org/W6712560600","https://openalex.org/W6713197546","https://openalex.org/W6731140762","https://openalex.org/W6735369481","https://openalex.org/W6736583452","https://openalex.org/W6744617256","https://openalex.org/W6745983287","https://openalex.org/W6747381837","https://openalex.org/W6749158954","https://openalex.org/W6751813273","https://openalex.org/W6752356114","https://openalex.org/W6842019321","https://openalex.org/W6891715281","https://openalex.org/W7064683377","https://openalex.org/W7072093103"],"related_works":["https://openalex.org/W2517027266","https://openalex.org/W2424871898","https://openalex.org/W4225852842","https://openalex.org/W2291847203","https://openalex.org/W3004532561","https://openalex.org/W4287776258","https://openalex.org/W2756241593","https://openalex.org/W3027997911","https://openalex.org/W2767651786","https://openalex.org/W2944724518"],"abstract_inverted_index":{"Recent":[0],"advancements":[1],"in":[2,119,202],"modeling":[3],"speech":[4],"and":[5,19,104,174,264,272],"audio":[6,169,184],"signals":[7],"using":[8,51,66,76],"deep":[9],"neural":[10,68],"networks":[11],"have":[12],"shown":[13],"that":[14,107,137],"systems":[15,35,49],"learning":[16,251,278],"both":[17],"features":[18],"the":[20,30,73,83,91,120,126,138,147,167,198,212,240,243,254,267],"classifier":[21],"can":[22],"be":[23],"built":[24,50],"directly":[25],"from":[26],"raw":[27,60],"signal.":[28,170],"However,":[29],"performance":[31],"of":[32,112,146,166,204,242,269],"such":[33],"end-to-end":[34,63],"for":[36,225,246,279],"acoustic":[37,87,209,228],"scene":[38],"classification":[39,92],"(ASC)":[40],"task":[41],"is":[42,100,188],"still":[43],"not":[44],"at":[45,141],"par":[46],"with":[47,102],"conventional":[48],"spectral":[52],"features.":[53],"In":[54,70,114],"this":[55,95],"work,":[56],"we":[57],"propose":[58],"a":[59,77,130,134,155,182],"waveform":[61],"based":[62],"ASC":[64,247,280],"system":[65],"convolutional":[67],"network.":[69],"contrast":[71],"to":[72,89,109,116,144,159,180,196,221,238,259],"existing":[74],"studies":[75],"non-hierarchical":[78],"model,":[79],"our":[80,97],"framework":[81,128,214,245],"leverages":[82],"hierarchical":[84],"relations":[85],"between":[86],"categories":[88],"improve":[90],"performance.":[93],"To":[94],"aim,":[96],"multi-task":[98],"model":[99,135],"trained":[101],"coarse":[103],"fine":[105],"labels":[106],"correspond":[108],"different":[110,223],"levels":[111],"abstraction.":[113],"order":[115],"ensure":[117],"consistency":[118],"encoded":[121],"information":[122],"via":[123,190],"label":[124],"hierarchy,":[125],"proposed":[127,213,244,255],"uses":[129],"prototypical":[131],"model.":[132],"Such":[133],"ensures":[136],"learned":[139,150],"representations":[140,162],"least":[142],"match":[143],"one":[145],"global":[148],"categorical":[149],"prototypes.":[151],"We":[152],"also":[153],"employed":[154],"statistical":[156],"pooling":[157],"layer":[158],"aggregate":[160],"hidden":[161],"over":[163],"multiple":[164],"frames":[165],"input":[168],"The":[171],"statistics":[172],"(mean":[173],"standard":[175],"deviation)":[176],"are":[177,236],"concatenated":[178],"together":[179],"form":[181],"fixed-length":[183],"embedding.":[185],"This":[186],"aggregation":[187],"done":[189],"an":[191],"attention":[192,200],"module":[193],"so":[194],"as":[195],"guide":[197],"model's":[199,256],"even":[201],"presence":[203],"relatively":[205],"short":[206],"or":[207],"transient":[208],"events.":[210],"Further,":[211],"incorporate":[215],"two":[216],"parallel":[217],"feature":[218],"processing":[219],"pipelines":[220],"achieve":[222],"resolutions":[224],"extracting":[226],"important":[227],"cues.":[229],"Various":[230],"experiments":[231,252],"on":[232,275],"publicly":[233],"available":[234],"datasets":[235],"performed":[237],"demonstrate":[239,266],"effectiveness":[241],"task.":[248,281],"Additional":[249],"transfer":[250],"showed":[253],"adaptation":[257],"capability":[258],"unseen":[260],"data.":[261],"Network":[262],"analysis":[263],"visualizations":[265],"importance":[268],"individual":[270],"modules":[271],"their":[273],"impact":[274],"overall":[276],"representation":[277]},"counts_by_year":[{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":4},{"year":2023,"cited_by_count":3},{"year":2022,"cited_by_count":8},{"year":2021,"cited_by_count":8}],"updated_date":"2026-04-05T17:49:38.594831","created_date":"2025-10-10T00:00:00"}
