{"id":"https://openalex.org/W2982030720","doi":"https://doi.org/10.1145/3343031.3351148","title":"Towards a Perceptual Loss","display_name":"Towards a Perceptual Loss","publication_year":2019,"publication_date":"2019-10-15","ids":{"openalex":"https://openalex.org/W2982030720","doi":"https://doi.org/10.1145/3343031.3351148","mag":"2982030720"},"language":"en","primary_location":{"id":"doi:10.1145/3343031.3351148","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3343031.3351148","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3343031.3351148","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 27th ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3343031.3351148","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5037474835","display_name":"Ishwarya Ananthabhotla","orcid":null},"institutions":[{"id":"https://openalex.org/I63966007","display_name":"Massachusetts Institute of Technology","ror":"https://ror.org/042nb2s44","country_code":"US","type":"education","lineage":["https://openalex.org/I63966007"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Ishwarya Ananthabhotla","raw_affiliation_strings":["Massachusetts Institute of Technology, Cambridge, MA, USA"],"affiliations":[{"raw_affiliation_string":"Massachusetts Institute of Technology, Cambridge, MA, USA","institution_ids":["https://openalex.org/I63966007"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001413512","display_name":"Sebastian Ewert","orcid":"https://orcid.org/0000-0002-0718-0476"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sebastian Ewert","raw_affiliation_strings":["Spotify, Inc., London, United Kingdom"],"affiliations":[{"raw_affiliation_string":"Spotify, Inc., London, United Kingdom","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5086159548","display_name":"Joseph A. Paradiso","orcid":"https://orcid.org/0000-0002-0719-7104"},"institutions":[{"id":"https://openalex.org/I63966007","display_name":"Massachusetts Institute of Technology","ror":"https://ror.org/042nb2s44","country_code":"US","type":"education","lineage":["https://openalex.org/I63966007"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Joseph A. Paradiso","raw_affiliation_strings":["Massachusetts Institute of Technology, Cambridge, MA, USA"],"affiliations":[{"raw_affiliation_string":"Massachusetts Institute of Technology, Cambridge, MA, USA","institution_ids":["https://openalex.org/I63966007"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5037474835"],"corresponding_institution_ids":["https://openalex.org/I63966007"],"apc_list":null,"apc_paid":null,"fwci":1.1729,"has_fulltext":true,"cited_by_count":9,"citation_normalized_percentile":{"value":0.7969856,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"1518","last_page":"1525"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9984999895095825,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9984999895095825,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9968000054359436,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11008","display_name":"Aerodynamics and Acoustics in Jet Flows","score":0.9966999888420105,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7935976982116699},{"id":"https://openalex.org/keywords/psychoacoustics","display_name":"Psychoacoustics","score":0.6638685464859009},{"id":"https://openalex.org/keywords/codec","display_name":"Codec","score":0.5363326072692871},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5248700976371765},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.479388952255249},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4793039858341217},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.4776495695114136},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.4598376154899597},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.45564398169517517},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.35987234115600586}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7935976982116699},{"id":"https://openalex.org/C9940772","wikidata":"https://www.wikidata.org/wiki/Q557399","display_name":"Psychoacoustics","level":3,"score":0.6638685464859009},{"id":"https://openalex.org/C161765866","wikidata":"https://www.wikidata.org/wiki/Q184748","display_name":"Codec","level":2,"score":0.5363326072692871},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5248700976371765},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.479388952255249},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4793039858341217},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.4776495695114136},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.4598376154899597},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.45564398169517517},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.35987234115600586},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C169760540","wikidata":"https://www.wikidata.org/wiki/Q207011","display_name":"Neuroscience","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1145/3343031.3351148","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3343031.3351148","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3343031.3351148","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 27th ACM International Conference on Multimedia","raw_type":"proceedings-article"},{"id":"pmh:oai:dspace.mit.edu:1721.1/137115","is_oa":true,"landing_page_url":"https://hdl.handle.net/1721.1/137115","pdf_url":null,"source":{"id":"https://openalex.org/S4306400425","display_name":"DSpace@MIT (Massachusetts Institute of Technology)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I63966007","host_organization_name":"Massachusetts Institute of Technology","host_organization_lineage":["https://openalex.org/I63966007"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-sa","license_id":"https://openalex.org/licenses/cc-by-nc-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"MIT web domain","raw_type":"Article"},{"id":"pmh:oai:dspace.mit.edu:1721.1/137115.2","is_oa":true,"landing_page_url":"https://hdl.handle.net/1721.1/137115.2","pdf_url":null,"source":{"id":"https://openalex.org/S4306400425","display_name":"DSpace@MIT (Massachusetts Institute of Technology)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I63966007","host_organization_name":"Massachusetts Institute of Technology","host_organization_lineage":["https://openalex.org/I63966007"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-sa","license_id":"https://openalex.org/licenses/cc-by-nc-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"MIT web domain","raw_type":"http://purl.org/eprint/type/ConferencePaper"}],"best_oa_location":{"id":"doi:10.1145/3343031.3351148","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3343031.3351148","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3343031.3351148","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 27th ACM International Conference on Multimedia","raw_type":"proceedings-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","score":0.5899999737739563,"display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2982030720.pdf","grobid_xml":"https://content.openalex.org/works/W2982030720.grobid-xml"},"referenced_works_count":13,"referenced_works":["https://openalex.org/W1552314771","https://openalex.org/W1606487971","https://openalex.org/W1970308149","https://openalex.org/W2331128040","https://openalex.org/W2529497146","https://openalex.org/W2891607145","https://openalex.org/W2892129657","https://openalex.org/W2919362165","https://openalex.org/W2950299304","https://openalex.org/W2963174698","https://openalex.org/W2963522749","https://openalex.org/W2963609956","https://openalex.org/W2964229681"],"related_works":["https://openalex.org/W2107680156","https://openalex.org/W2964213236","https://openalex.org/W2163719598","https://openalex.org/W3161919736","https://openalex.org/W2387018512","https://openalex.org/W4301184752","https://openalex.org/W2016620782","https://openalex.org/W2469919065","https://openalex.org/W2751422192","https://openalex.org/W1509797384"],"abstract_inverted_index":{"Generative":[0],"audio":[1,204],"models":[2,68,202],"based":[3],"on":[4],"neural":[5,126],"networks":[6,105],"have":[7,69,83],"led":[8],"to":[9,72,95,128,170],"considerable":[10],"improvements":[11,86],"across":[12],"fields":[13],"including":[14],"speech":[15],"enhancement,":[16],"source":[17],"separation,":[18],"and":[19,62,82,109],"text-to-speech":[20],"synthesis.":[21],"These":[22],"systems":[23],"are":[24,76,153],"typically":[25],"trained":[26],"in":[27,85,87,121,177,193],"a":[28,125,134,141,158],"supervised":[29],"fashion":[30],"using":[31,183],"simple":[32],"element-wise":[33],"l1":[34],"or":[35],"l2":[36],"losses.":[37],"However,":[38],"because":[39],"they":[40],"do":[41,92],"not":[42,93],"capture":[43],"properties":[44],"of":[45,57,140,199],"the":[46,58,138,197],"human":[47],"auditory":[48],"system,":[49],"such":[50,90],"losses":[51,91,192],"encourage":[52,73],"modelling":[53],"perceptually":[54,154],"meaningless":[55],"aspects":[56],"output,":[59],"wasting":[60],"capacity":[61],"limiting":[63],"performance.":[64],"Additionally,":[65],"while":[66],"adversarial":[67,104],"been":[70],"employed":[71],"outputs":[74,169],"that":[75,152],"statistically":[77],"indistinguishable":[78],"from":[79],"ground":[80],"truth":[81],"resulted":[84],"this":[88,113,145],"regard,":[89],"need":[94],"explicitly":[96],"model":[97,143],"perception":[98],"as":[99,133,174],"their":[100],"task;":[101],"furthermore,":[102],"training":[103],"remains":[106],"an":[107,117,130,184,188],"unstable":[108],"slow":[110],"process.":[111],"In":[112],"work,":[114],"we":[115,148,165],"investigate":[116],"idea":[118],"fundamentally":[119],"rooted":[120],"psychoacoustics.":[122],"We":[123],"train":[124],"network":[127],"emulate":[129],"MP3":[131,146],"codec":[132],"differentiable":[135],"function.":[136],"Feeding":[137],"output":[139],"generative":[142],"through":[144],"function,":[147],"remove":[149],"signal":[150],"components":[151],"irrelevant":[155],"before":[156],"computing":[157],"loss.":[159],"To":[160],"further":[161],"stabilize":[162],"gradient":[163],"propagation,":[164],"employ":[166],"intermediate":[167],"layer":[168],"define":[171],"our":[172],"loss,":[173],"found":[175],"useful":[176],"image":[178],"domain":[179],"methods.":[180],"Our":[181],"experiments":[182],"autoencoding":[185],"task":[186],"show":[187],"improvement":[189],"over":[190],"standard":[191],"listening":[194],"tests,":[195],"indicating":[196],"potential":[198],"psychoacoustically":[200],"motivated":[201],"for":[203],"generation.":[205]},"counts_by_year":[{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":2},{"year":2021,"cited_by_count":2},{"year":2020,"cited_by_count":3}],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2025-10-10T00:00:00"}
