{"id":"https://openalex.org/W4393971165","doi":"https://doi.org/10.1145/3640544.3645216","title":"EvaluLLM: LLM assisted evaluation of generative outputs","display_name":"EvaluLLM: LLM assisted evaluation of generative outputs","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4393971165","doi":"https://doi.org/10.1145/3640544.3645216"},"language":"en","primary_location":{"id":"doi:10.1145/3640544.3645216","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3640544.3645216","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3640544.3645216","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion Proceedings of the 29th International Conference on Intelligent User Interfaces","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3640544.3645216","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5056038276","display_name":"Michael Desmond","orcid":"https://orcid.org/0000-0002-1796-1161"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Michael Desmond","raw_affiliation_strings":["IBM Research, United States"],"affiliations":[{"raw_affiliation_string":"IBM Research, United States","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003440919","display_name":"Zahra Ashktorab","orcid":"https://orcid.org/0000-0002-0686-7911"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zahra Ashktorab","raw_affiliation_strings":["IBM Research, United States"],"affiliations":[{"raw_affiliation_string":"IBM Research, United States","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064662859","display_name":"Qian Pan","orcid":"https://orcid.org/0000-0002-0437-1736"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qian Pan","raw_affiliation_strings":["IBM Research, United States"],"affiliations":[{"raw_affiliation_string":"IBM Research, United States","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082995130","display_name":"Casey Dugan","orcid":"https://orcid.org/0000-0002-1508-2091"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Casey Dugan","raw_affiliation_strings":["IBM Research, United States"],"affiliations":[{"raw_affiliation_string":"IBM Research, United States","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5082721686","display_name":"James M. Johnson","orcid":"https://orcid.org/0000-0002-7199-5493"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"James M. Johnson","raw_affiliation_strings":["IBM Research, United States"],"affiliations":[{"raw_affiliation_string":"IBM Research, United States","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5056038276"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":10.9091,"has_fulltext":true,"cited_by_count":30,"citation_normalized_percentile":{"value":0.98664267,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"30","last_page":"32"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9739000201225281,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8441706895828247},{"id":"https://openalex.org/keywords/natural-language-generation","display_name":"Natural language generation","score":0.7374985814094543},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.6989376544952393},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.5590944290161133},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.5582759380340576},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.5411106944084167},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.5343030691146851},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5197803378105164},{"id":"https://openalex.org/keywords/measure","display_name":"Measure (data warehouse)","score":0.42648836970329285},{"id":"https://openalex.org/keywords/evaluation-methods","display_name":"Evaluation methods","score":0.4216546416282654},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.3466784358024597},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.27442029118537903},{"id":"https://openalex.org/keywords/systems-engineering","display_name":"Systems engineering","score":0.16895535588264465},{"id":"https://openalex.org/keywords/reliability-engineering","display_name":"Reliability engineering","score":0.16602814197540283},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.07287049293518066}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8441706895828247},{"id":"https://openalex.org/C2776187449","wikidata":"https://www.wikidata.org/wiki/Q1513879","display_name":"Natural language generation","level":3,"score":0.7374985814094543},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.6989376544952393},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5590944290161133},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.5582759380340576},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.5411106944084167},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.5343030691146851},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5197803378105164},{"id":"https://openalex.org/C2780009758","wikidata":"https://www.wikidata.org/wiki/Q6804172","display_name":"Measure (data warehouse)","level":2,"score":0.42648836970329285},{"id":"https://openalex.org/C3018395757","wikidata":"https://www.wikidata.org/wiki/Q1379672","display_name":"Evaluation methods","level":2,"score":0.4216546416282654},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.3466784358024597},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.27442029118537903},{"id":"https://openalex.org/C201995342","wikidata":"https://www.wikidata.org/wiki/Q682496","display_name":"Systems engineering","level":1,"score":0.16895535588264465},{"id":"https://openalex.org/C200601418","wikidata":"https://www.wikidata.org/wiki/Q2193887","display_name":"Reliability engineering","level":1,"score":0.16602814197540283},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.07287049293518066},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3640544.3645216","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3640544.3645216","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3640544.3645216","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion Proceedings of the 29th International Conference on Intelligent User Interfaces","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3640544.3645216","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3640544.3645216","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3640544.3645216","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion Proceedings of the 29th International Conference on Intelligent User Interfaces","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4393971165.pdf","grobid_xml":"https://content.openalex.org/works/W4393971165.grobid-xml"},"referenced_works_count":1,"referenced_works":["https://openalex.org/W1593271688"],"related_works":["https://openalex.org/W2955859849","https://openalex.org/W2152921782","https://openalex.org/W382594479","https://openalex.org/W2470045054","https://openalex.org/W2575772232","https://openalex.org/W2151245229","https://openalex.org/W2140902089","https://openalex.org/W1510553545","https://openalex.org/W3020827637","https://openalex.org/W199086061"],"abstract_inverted_index":{"With":[0],"the":[1,16,83,148,153],"rapid":[2],"improvement":[3],"in":[4],"large":[5],"language":[6,23],"model":[7,171],"(LLM)":[8],"capabilities,":[9],"its":[10],"becoming":[11],"more":[12],"difficult":[13,62],"to":[14,36,63,68,105,164],"measure":[15],"quality":[17,75],"of":[18,85,115,131,135],"outputs":[19,137],"generated":[20,59,136],"by":[21],"natural":[22],"generation":[24],"(NLG)":[25],"systems.":[26],"Conventional":[27],"metrics":[28],"such":[29],"as":[30,72,87,121,128],"BLEU":[31],"and":[32,39,66,70,74,91,110,151,170],"ROUGE":[33],"are":[34,40,94],"bound":[35],"reference":[37],"data,":[38],"generally":[41],"unsuitable":[42],"for":[43,155],"tasks":[44],"that":[45],"require":[46],"creative":[47],"or":[48],"diverse":[49],"outputs.":[50],"Human":[51],"evaluation":[52,112,143,149],"is":[53,61,126],"an":[54,102,119],"option,":[55],"but":[56],"manually":[57],"evaluating":[58],"text":[60],"do":[64],"well,":[65],"expensive":[67],"scale":[69],"repeat":[71],"requirements":[73],"criteria":[76],"change.":[77],"Recent":[78],"work":[79],"has":[80],"focused":[81],"on":[82,139],"use":[84],"LLMs":[86],"customize-able":[88],"NLG":[89,116],"evaluators,":[90],"initial":[92],"results":[93],"promising.":[95],"In":[96],"this":[97],"demonstration":[98],"we":[99],"present":[100],"EvaluLLM,":[101],"application":[103],"designed":[104],"help":[106],"practitioners":[107],"setup,":[108],"run":[109],"review":[111],"over":[113],"sets":[114],"outputs,":[117],"using":[118],"LLM":[120],"a":[122,129,140],"custom":[123],"evaluator.":[124],"Evaluation":[125],"formulated":[127],"series":[130],"choices":[132],"between":[133],"pairs":[134],"conditioned":[138],"user":[141],"provided":[142],"criteria.":[144],"This":[145],"approach":[146],"simplifies":[147],"task":[150],"obviates":[152],"need":[154],"complex":[156],"scoring":[157],"algorithms.":[158],"The":[159],"system":[160],"can":[161],"be":[162],"applied":[163],"general":[165],"evaluation,":[166,169],"human":[167],"assisted":[168],"selection":[172],"problems.":[173]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":20},{"year":2024,"cited_by_count":7}],"updated_date":"2026-03-10T16:38:18.471706","created_date":"2025-10-10T00:00:00"}
