{"id":"https://openalex.org/W4406891895","doi":"https://doi.org/10.1109/fllm63129.2024.10852500","title":"Using LLMs for Evaluating QA Systems: Exploration and Assessment","display_name":"Using LLMs for Evaluating QA Systems: Exploration and Assessment","publication_year":2024,"publication_date":"2024-11-26","ids":{"openalex":"https://openalex.org/W4406891895","doi":"https://doi.org/10.1109/fllm63129.2024.10852500"},"language":"en","primary_location":{"id":"doi:10.1109/fllm63129.2024.10852500","is_oa":false,"landing_page_url":"https://doi.org/10.1109/fllm63129.2024.10852500","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 2nd International Conference on Foundation and Large Language Models (FLLM)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5040781316","display_name":"Hadel Alhawasi","orcid":null},"institutions":[{"id":"https://openalex.org/I193531525","display_name":"George Washington University","ror":"https://ror.org/00y4zzh67","country_code":"US","type":"education","lineage":["https://openalex.org/I193531525"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Hadel Alhawasi","raw_affiliation_strings":["The George Washington University,Department of Computer Science,Washington,DC,USA"],"affiliations":[{"raw_affiliation_string":"The George Washington University,Department of Computer Science,Washington,DC,USA","institution_ids":["https://openalex.org/I193531525"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5111635384","display_name":"Abdou Youssef","orcid":null},"institutions":[{"id":"https://openalex.org/I193531525","display_name":"George Washington University","ror":"https://ror.org/00y4zzh67","country_code":"US","type":"education","lineage":["https://openalex.org/I193531525"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Abdou Youssef","raw_affiliation_strings":["The George Washington University,Department of Computer Science,Washington,DC,USA"],"affiliations":[{"raw_affiliation_string":"The George Washington University,Department of Computer Science,Washington,DC,USA","institution_ids":["https://openalex.org/I193531525"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5040781316"],"corresponding_institution_ids":["https://openalex.org/I193531525"],"apc_list":null,"apc_paid":null,"fwci":0.6848,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.73318391,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"462","last_page":"469"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10876","display_name":"Fault Detection and Control Systems","score":0.8033000230789185,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10876","display_name":"Fault Detection and Control Systems","score":0.8033000230789185,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5396547317504883}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5396547317504883}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/fllm63129.2024.10852500","is_oa":false,"landing_page_url":"https://doi.org/10.1109/fllm63129.2024.10852500","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 2nd International Conference on Foundation and Large Language Models (FLLM)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W4312438588","https://openalex.org/W4367692219","https://openalex.org/W4385572754","https://openalex.org/W4389519254","https://openalex.org/W4392773006","https://openalex.org/W6621906925","https://openalex.org/W6677328238","https://openalex.org/W6682631176","https://openalex.org/W6755207826","https://openalex.org/W6761205521","https://openalex.org/W6767186723","https://openalex.org/W6798182279","https://openalex.org/W6805607268","https://openalex.org/W6838998875","https://openalex.org/W6849557284","https://openalex.org/W6850357939","https://openalex.org/W6850603086","https://openalex.org/W6850936240","https://openalex.org/W6852252866","https://openalex.org/W6854866820","https://openalex.org/W6856707472","https://openalex.org/W6856883136","https://openalex.org/W6860041859","https://openalex.org/W6898505805"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"The":[0,89,98,105,123,144,159,232],"evaluation":[1,128,152,157,256,283],"process":[2],"of":[3,77,118,295,316],"detecting":[4],"the":[5,75,81,109,132,136,169,177,188,196,202,205,220,249,254,264,273,277,286,293,314,320],"similarity":[6,64,82,137,255],"between":[7,83,138],"reference":[8,87,142],"text":[9,15,60],"and":[10,32,54,61,86,95,111,121,141,173,195,219,245,259,300,330],"Large":[11],"Language":[12],"Model":[13],"(LLM)-generated":[14],"is":[16,21,171,198,222],"a":[17,43,236],"challenging":[18],"problem.":[19],"It":[20],"difficult":[22],"to":[23,37,73,79,102,134,166,239,242,247,325],"measure":[24,103],"it":[25,148],"automatically.":[26],"Traditional":[27],"metrics,":[28],"such":[29],"as":[30,192,216,305,328,332],"ROUGE":[31],"BERTScore,":[33],"have":[34,38,55,150],"been":[35],"shown":[36],"some":[39],"limitations.":[40],"They":[41],"showed":[42,280,289],"relatively":[44],"low":[45],"correlation":[46],"with":[47,253,257],"humans.":[48],"Also,":[49,176],"they":[50],"penalize":[51],"LLM-generated":[52,84,139],"text,":[53],"difficulty":[56],"recognizing":[57],"noise":[58],"in":[59,65,185,201,292,313],"qualitatively":[62],"evaluate":[63,80,135],"texts.In":[66],"this":[67],"paper,":[68],"we":[69],"introduce":[70],"an":[71,151],"approach":[72,90,233],"studying":[74],"use":[76,115,235],"LLM":[78,133],"answers":[85,140],"answers.":[88,143],"framework":[91,99],"includes":[92],"classes,":[93],"prompt,":[94,146],"classification":[96,160,165,178,197,221,237,266,275,279,288],"schemes.":[97],"defines":[100],"classes":[101,110,189,206,246],"similarity.":[104],"prompt":[106,125],"will":[107,149,154,234],"include":[108],"their":[112],"definitions.":[113],"We":[114,251],"two":[116,182],"types":[117],"prompts:":[119],"instructed":[120,124],"uninstructed.":[122],"contains":[126],"specific":[127],"rules":[129],"for":[130],"conducting":[131],"uninstructed":[145],"though":[147],"request,":[153],"not":[155,208],"contain":[156],"rules.":[158],"schemes":[161,179],"range":[162],"from":[163],"binary":[164,265],"multi-class":[167,274],"classification;":[168],"latter":[170],"finer-grained":[172],"more":[174],"informative.":[175],"group":[180],"into":[181],"categories,":[183],"where":[184],"one":[186],"category":[187,204],"are":[190,207],"viewed":[191,215],"mutually":[193,210],"exclusive":[194,211],"unguided,":[199],"while":[200,285,312],"other":[203],"all":[209],"but":[212],"can":[213],"be":[214,326],"partly":[217],"hierarchical,":[218],"guided,":[223],"i.e.,":[224],"totally":[225,309],"ordered":[226,229,310,318],"or":[227],"partially":[228,317],"(a":[230],"hybrid).":[231],"scheme":[238,267],"allow":[240],"LLMs":[241],"invoke":[243],"prompts":[244],"perform":[248,302],"evaluation.":[250],"experimented":[252],"GPT-4":[258,301,327],"Gemini.Our":[260],"results":[261],"indicate":[262],"that":[263],"shows":[268],"significant":[269],"accuracy":[270],"results.":[271],"In":[272],"schemes,":[276],"unguided":[278],"very":[281,303],"poor":[282],"performance,":[284],"guided":[287],"excellent":[290],"performance":[291],"90-percentiles":[294],"accuracy.":[296],"Furthermore,":[297],"both":[298],"Gemini":[299,331],"well":[304],"evaluators,":[306],"especially":[307],"under":[308],"classification,":[311,319],"case":[315],"winning":[321],"combination":[322],"turned":[323],"out":[324],"evaluator":[329],"answerer.":[333]},"counts_by_year":[{"year":2025,"cited_by_count":2}],"updated_date":"2026-03-12T08:34:05.389933","created_date":"2025-10-10T00:00:00"}
