{"id":"https://openalex.org/W7138470213","doi":"https://doi.org/10.1609/aaai.v40i31.39808","title":"A Novel Approach to Evaluating Evaluation Metrics for Multi-Output Structured Prediction","display_name":"A Novel Approach to Evaluating Evaluation Metrics for Multi-Output Structured Prediction","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7138470213","doi":"https://doi.org/10.1609/aaai.v40i31.39808"},"language":null,"primary_location":{"id":"doi:10.1609/aaai.v40i31.39808","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i31.39808","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://doi.org/10.1609/aaai.v40i31.39808","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129687922","display_name":"Akshay Vyas","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Akshay Vyas","raw_affiliation_strings":["University of Texas at Dallas"],"affiliations":[{"raw_affiliation_string":"University of Texas at Dallas","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125741450","display_name":"Angelo Pimienta","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Angelo Pimienta","raw_affiliation_strings":["University of Texas at Dallas"],"affiliations":[{"raw_affiliation_string":"University of Texas at Dallas","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5017484756","display_name":"Nicholas Ruozzi","orcid":"https://orcid.org/0000-0002-4262-2698"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nicholas Ruozzi","raw_affiliation_strings":["University of Texas at Dallas"],"affiliations":[{"raw_affiliation_string":"University of Texas at Dallas","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5129687922"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.87014925,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"31","first_page":"26062","last_page":"26071"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5687999725341797,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5687999725341797,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.12489999830722809,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.08150000125169754,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.7204999923706055},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.6714000105857849},{"id":"https://openalex.org/keywords/variety","display_name":"Variety (cybernetics)","score":0.6489999890327454},{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.5975000262260437},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.5694000124931335},{"id":"https://openalex.org/keywords/ground-truth","display_name":"Ground truth","score":0.5023999810218811},{"id":"https://openalex.org/keywords/structured-prediction","display_name":"Structured prediction","score":0.4041999876499176}],"concepts":[{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.7204999923706055},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7062000036239624},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.6714000105857849},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.6489999890327454},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.5975000262260437},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.5694000124931335},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5259000062942505},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.522599995136261},{"id":"https://openalex.org/C146849305","wikidata":"https://www.wikidata.org/wiki/Q370766","display_name":"Ground truth","level":2,"score":0.5023999810218811},{"id":"https://openalex.org/C22367795","wikidata":"https://www.wikidata.org/wiki/Q7625208","display_name":"Structured prediction","level":2,"score":0.4041999876499176},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.37709999084472656},{"id":"https://openalex.org/C2777267654","wikidata":"https://www.wikidata.org/wiki/Q3519023","display_name":"Test (biology)","level":2,"score":0.3395000100135803},{"id":"https://openalex.org/C2780009758","wikidata":"https://www.wikidata.org/wiki/Q6804172","display_name":"Measure (data warehouse)","level":2,"score":0.3328999876976013},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.3012999892234802},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.2727000117301941},{"id":"https://openalex.org/C80519477","wikidata":"https://www.wikidata.org/wiki/Q3532236","display_name":"Scenario testing","level":3,"score":0.26820001006126404},{"id":"https://openalex.org/C82214349","wikidata":"https://www.wikidata.org/wiki/Q657339","display_name":"Software metric","level":5,"score":0.25529998540878296},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.25279998779296875}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v40i31.39808","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i31.39808","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i31.39808","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i31.39808","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"In":[0,114],"multi-output":[1],"structured":[2],"prediction":[3],"tasks,":[4],"while":[5,133],"only":[6,138],"one":[7],"ground":[8],"truth":[9],"label":[10],"may":[11,22],"be":[12,23],"provided":[13],"in":[14,43,49,106,129],"the":[15,157],"training":[16],"data,":[17],"multiple":[18],"equally":[19],"valid":[20],"outputs":[21,108],"possible,":[24],"making":[25],"reliable":[26],"evaluation":[27],"a":[28,63,77,95],"persistent":[29],"challenge.":[30],"We":[31,87],"postulate":[32],"that":[33,119],"human":[34],"evaluators":[35],"implicitly":[36],"use":[37,70,88],"task-specific":[38,67],"invariants,":[39],"e.g.,":[40],"object":[41],"boundaries":[42],"colorized":[44],"images":[45],"or":[46,164],"named":[47],"entities":[48],"translations,":[50],"to":[51,75,97,150],"judge":[52],"if":[53],"an":[54,110,123],"output":[55],"is":[56],"acceptable.":[57],"Under":[58],"this":[59,141],"assumption,":[60],"we":[61,117,144,155],"introduce":[62],"notion":[64],"of":[65,79,94],"approximate":[66],"invariants":[68,91,105,121],"and":[69,84,154],"them":[71],"as":[72,92],"diagnostic":[73],"tools":[74],"evaluate":[76],"variety":[78],"existing":[80],"metrics":[81,132,136,148,162],"for":[82],"vision":[83],"language":[85],"tasks.":[86],"these":[89],"task":[90],"part":[93],"framework":[96],"systematically":[98],"test":[99],"metric":[100],"reliability":[101],"by":[102],"encouraging":[103],"domain-relevant":[104],"model":[107],"via":[109],"augmented":[111,124],"loss":[112,125],"function.":[113],"our":[115],"experiments,":[116],"observe":[118],"enforcing":[120],"with":[122],"yields":[126],"substantial":[127],"improvements":[128],"popular":[130],"distributional":[131,161],"more":[134],"traditional":[135],"change":[137],"marginally.":[139],"Through":[140],"invariants-driven":[142],"evaluation,":[143],"expose":[145],"where":[146],"standard":[147],"fail":[149],"detect":[151],"meaningful":[152],"differences,":[153],"highlight":[156],"conditions":[158],"under":[159],"which":[160],"succeed":[163],"still":[165],"fall":[166],"short.":[167]},"counts_by_year":[],"updated_date":"2026-03-20T20:47:17.329874","created_date":"2026-03-18T00:00:00"}
