{"id":"https://openalex.org/W4386794666","doi":"https://doi.org/10.48550/arxiv.2309.07462","title":"Are Large Language Model-based Evaluators the Solution to Scaling Up Multilingual Evaluation?","display_name":"Are Large Language Model-based Evaluators the Solution to Scaling Up Multilingual Evaluation?","publication_year":2023,"publication_date":"2023-09-14","ids":{"openalex":"https://openalex.org/W4386794666","doi":"https://doi.org/10.48550/arxiv.2309.07462"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2309.07462","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2309.07462","pdf_url":"https://arxiv.org/pdf/2309.07462","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2309.07462","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5041796484","display_name":"Rishav Hada","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Hada, Rishav","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040945700","display_name":"Varun Gumma","orcid":"https://orcid.org/0009-0002-5746-3017"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gumma, Varun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031169451","display_name":"Adrian de Wynter","orcid":"https://orcid.org/0000-0003-2679-7241"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"de Wynter, Adrian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059967242","display_name":"Harshita Diddee","orcid":"https://orcid.org/0000-0002-0852-7371"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Diddee, Harshita","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100620108","display_name":"Mohamed Ahmed","orcid":"https://orcid.org/0000-0002-8320-6631"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ahmed, Mohamed","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008944385","display_name":"Monojit Choudhury","orcid":"https://orcid.org/0000-0001-7473-7839"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Choudhury, Monojit","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013639574","display_name":"Kalika Bali","orcid":"https://orcid.org/0000-0001-9275-742X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bali, Kalika","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5005513786","display_name":"Sunayana Sitaram","orcid":"https://orcid.org/0000-0003-4251-9719"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sitaram, Sunayana","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5041796484"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":7,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12010","display_name":"Evaluation and Performance Assessment","score":0.7305999994277954,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T12010","display_name":"Evaluation and Performance Assessment","score":0.7305999994277954,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12151","display_name":"Interpreting and Communication in Healthcare","score":0.7287999987602234,"subfield":{"id":"https://openalex.org/subfields/3600","display_name":"General Health Professions"},"field":{"id":"https://openalex.org/fields/36","display_name":"Health Professions"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.6767732501029968},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.549699604511261},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.44188234210014343},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.1765848994255066}],"concepts":[{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.6767732501029968},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.549699604511261},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.44188234210014343},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.1765848994255066},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2309.07462","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2309.07462","pdf_url":"https://arxiv.org/pdf/2309.07462","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2309.07462","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2309.07462","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2309.07462","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2309.07462","pdf_url":"https://arxiv.org/pdf/2309.07462","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.8600000143051147}],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4386794666.pdf","grobid_xml":"https://content.openalex.org/works/W4386794666.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W2382290278","https://openalex.org/W2478288626","https://openalex.org/W4391913857","https://openalex.org/W2350741829"],"abstract_inverted_index":{"Large":[0],"Language":[1,8],"Models":[2],"(LLMs)":[3],"excel":[4],"in":[5,16,69,94,110],"various":[6],"Natural":[7],"Processing":[9],"(NLP)":[10],"tasks,":[11,83],"yet":[12],"their":[13],"evaluation,":[14],"particularly":[15],"languages":[17],"beyond":[18],"the":[19,48,62,101],"top":[20],"$20$,":[21],"remains":[22],"inadequate":[23],"due":[24],"to":[25,35,51,116],"existing":[26],"benchmarks":[27],"and":[28,54,86,112],"metrics":[29],"limitations.":[30],"Employing":[31],"LLMs":[32],"as":[33,43],"evaluators":[34,96],"rank":[36],"or":[37],"score":[38],"other":[39],"models'":[40],"outputs":[41],"emerges":[42],"a":[44,92],"viable":[45],"solution,":[46],"addressing":[47],"constraints":[49],"tied":[50],"human":[52,78],"annotators":[53],"established":[55],"benchmarks.":[56],"In":[57],"this":[58],"study,":[59],"we":[60],"explore":[61],"potential":[63],"of":[64,103,120],"LLM-based":[65],"evaluators,":[66],"specifically":[67],"GPT-4":[68],"enhancing":[70],"multilingual":[71],"evaluation":[72,119],"by":[73],"calibrating":[74],"them":[75],"against":[76],"$20$K":[77],"judgments":[79],"across":[80,123],"three":[81],"text-generation":[82],"five":[84],"metrics,":[85],"eight":[87],"languages.":[88,125],"Our":[89],"analysis":[90],"reveals":[91],"bias":[93],"GPT4-based":[95],"towards":[97],"higher":[98],"scores,":[99],"underscoring":[100],"necessity":[102],"calibration":[104],"with":[105],"native":[106],"speaker":[107],"judgments,":[108],"especially":[109],"low-resource":[111],"non-Latin":[113],"script":[114],"languages,":[115],"ensure":[117],"accurate":[118],"LLM":[121],"performance":[122],"diverse":[124]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":3},{"year":2023,"cited_by_count":2}],"updated_date":"2026-03-12T08:34:05.389933","created_date":"2023-09-16T00:00:00"}
