{"id":"https://openalex.org/W4400525527","doi":"https://doi.org/10.1145/3626772.3657675","title":"Towards Robust QA Evaluation via Open LLMs","display_name":"Towards Robust QA Evaluation via Open LLMs","publication_year":2024,"publication_date":"2024-07-10","ids":{"openalex":"https://openalex.org/W4400525527","doi":"https://doi.org/10.1145/3626772.3657675"},"language":"en","primary_location":{"id":"doi:10.1145/3626772.3657675","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3626772.3657675","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5043026875","display_name":"Ehsan Kamalloo","orcid":"https://orcid.org/0000-0003-3081-8762"},"institutions":[{"id":"https://openalex.org/I151746483","display_name":"University of Waterloo","ror":"https://ror.org/01aff2v68","country_code":"CA","type":"education","lineage":["https://openalex.org/I151746483"]}],"countries":["CA"],"is_corresponding":true,"raw_author_name":"Ehsan Kamalloo","raw_affiliation_strings":["University of Waterloo, Waterloo, Canada"],"affiliations":[{"raw_affiliation_string":"University of Waterloo, Waterloo, Canada","institution_ids":["https://openalex.org/I151746483"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103308060","display_name":"Shivani Upadhyay","orcid":"https://orcid.org/0009-0007-7071-2344"},"institutions":[{"id":"https://openalex.org/I151746483","display_name":"University of Waterloo","ror":"https://ror.org/01aff2v68","country_code":"CA","type":"education","lineage":["https://openalex.org/I151746483"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Shivani Upadhyay","raw_affiliation_strings":["University of Waterloo, Waterloo, Canada"],"affiliations":[{"raw_affiliation_string":"University of Waterloo, Waterloo, Canada","institution_ids":["https://openalex.org/I151746483"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5082997975","display_name":"Jimmy Lin","orcid":"https://orcid.org/0000-0002-0661-7189"},"institutions":[{"id":"https://openalex.org/I151746483","display_name":"University of Waterloo","ror":"https://ror.org/01aff2v68","country_code":"CA","type":"education","lineage":["https://openalex.org/I151746483"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Jimmy Lin","raw_affiliation_strings":["University of Waterloo, Waterloo, Canada"],"affiliations":[{"raw_affiliation_string":"University of Waterloo, Waterloo, Canada","institution_ids":["https://openalex.org/I151746483"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5043026875"],"corresponding_institution_ids":["https://openalex.org/I151746483"],"apc_list":null,"apc_paid":null,"fwci":4.4396,"has_fulltext":false,"cited_by_count":13,"citation_normalized_percentile":{"value":0.95070216,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"2811","last_page":"2816"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9570000171661377,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9570000171661377,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9413999915122986,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9373000264167786,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5537892580032349}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5537892580032349}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3626772.3657675","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3626772.3657675","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320323817","display_name":"Universitas Brawijaya","ror":"https://ror.org/01wk3d929"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":27,"referenced_works":["https://openalex.org/W1583837637","https://openalex.org/W2951434086","https://openalex.org/W2963748441","https://openalex.org/W2979826702","https://openalex.org/W2983309655","https://openalex.org/W2999905431","https://openalex.org/W3091824185","https://openalex.org/W3099700870","https://openalex.org/W3102659883","https://openalex.org/W3156789018","https://openalex.org/W3196194504","https://openalex.org/W3206455169","https://openalex.org/W3214546336","https://openalex.org/W3214608568","https://openalex.org/W4205456754","https://openalex.org/W4225992558","https://openalex.org/W4238846128","https://openalex.org/W4285225959","https://openalex.org/W4287122359","https://openalex.org/W4385569780","https://openalex.org/W4385572714","https://openalex.org/W4387994989","https://openalex.org/W4392681182","https://openalex.org/W6600007113","https://openalex.org/W6600388300","https://openalex.org/W6600511658","https://openalex.org/W6602430550"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052","https://openalex.org/W2382290278","https://openalex.org/W4395014643"],"abstract_inverted_index":{"Instruction-tuned":[0],"large":[1],"language":[2],"models":[3],"(LLMs)":[4],"have":[5],"been":[6],"shown":[7],"to":[8,49,77,105,130],"be":[9],"viable":[10],"surrogates":[11],"for":[12,82],"the":[13,59,66],"widely":[14],"used,":[15],"albeit":[16],"overly":[17],"rigid,":[18],"lexical":[19],"matching":[20],"metrics":[21],"in":[22],"evaluating":[23],"question":[24],"answering":[25],"(QA)":[26],"models.":[27,136],"However,":[28,87],"these":[29],"LLM-based":[30,70,84],"evaluation":[31,109,115],"methods":[32],"are":[33,45,141],"invariably":[34],"based":[35,117],"on":[36,118],"proprietary":[37,43,93],"LLMs.":[38],"Despite":[39],"their":[40,55,62,92],"remarkable":[41],"capabilities,":[42],"LLMs":[44,81,89],"costly":[46],"and":[47,64,120,134,139],"subject":[48],"internal":[50],"changes":[51],"that":[52,113],"can":[53],"affect":[54],"output,":[56],"which":[57],"inhibits":[58],"reproducibility":[60],"of":[61,69],"results":[63],"limits":[65],"widespread":[67],"adoption":[68],"evaluation.":[71,86],"In":[72],"this":[73,97],"demo,":[74],"we":[75],"aim":[76],"use":[78],"publicly":[79],"available":[80,142],"standardizing":[83],"QA":[85],"open-source":[88],"lag":[90],"behind":[91],"counterparts.":[94],"We":[95,111],"overcome":[96],"gap":[98],"by":[99],"adopting":[100],"chain-of-thought":[101],"prompting":[102],"with":[103,126],"self-consistency":[104],"build":[106],"a":[107],"reliable":[108],"framework.":[110],"demonstrate":[112],"our":[114],"framework,":[116],"750M":[119],"7B":[121],"open":[122],"LLMs,":[123],"correlates":[124],"competitively":[125],"human":[127],"judgment,":[128],"compared":[129],"most":[131],"recent":[132],"GPT-3":[133],"GPT-4":[135],"Our":[137],"codebase":[138],"data":[140],"at":[143],"https://github.com/castorini/qa-eval.":[144]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":10},{"year":2024,"cited_by_count":1}],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-10T00:00:00"}
