{"id":"https://openalex.org/W4416036141","doi":"https://doi.org/10.18653/v1/2025.emnlp-main.988","title":"Mind the Gap: A Closer Look at Tokenization for Multiple-Choice Question Answering with LLMs","display_name":"Mind the Gap: A Closer Look at Tokenization for Multiple-Choice Question Answering with LLMs","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4416036141","doi":"https://doi.org/10.18653/v1/2025.emnlp-main.988"},"language":null,"primary_location":{"id":"doi:10.18653/v1/2025.emnlp-main.988","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.988","pdf_url":"https://aclanthology.org/2025.emnlp-main.988.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.emnlp-main.988.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5117865138","display_name":"Mario Sanz-Guerrero","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Mario Sanz-Guerrero","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050750082","display_name":"Minh Duc Bui","orcid":"https://orcid.org/0000-0002-0756-7136"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Minh Duc Bui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5093081493","display_name":"Katharina von der Wense","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Katharina von der Wense","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5117865138"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.18410159,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"19584","last_page":"19594"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.6870999932289124,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.6870999932289124,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.04349999874830246,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13274","display_name":"Expert finding and Q&A systems","score":0.03759999945759773,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.6991999745368958},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.397599995136261},{"id":"https://openalex.org/keywords/lexical-analysis","display_name":"Lexical analysis","score":0.2800999879837036},{"id":"https://openalex.org/keywords/circumscription","display_name":"Circumscription","score":0.2773999869823456}],"concepts":[{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.6991999745368958},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.489300012588501},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.397599995136261},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.3903999924659729},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3547999858856201},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.3474000096321106},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3230000138282776},{"id":"https://openalex.org/C95457728","wikidata":"https://www.wikidata.org/wiki/Q309","display_name":"History","level":0,"score":0.28139999508857727},{"id":"https://openalex.org/C176982825","wikidata":"https://www.wikidata.org/wiki/Q835922","display_name":"Lexical analysis","level":2,"score":0.2800999879837036},{"id":"https://openalex.org/C62360110","wikidata":"https://www.wikidata.org/wiki/Q96777007","display_name":"Circumscription","level":2,"score":0.2773999869823456},{"id":"https://openalex.org/C193035329","wikidata":"https://www.wikidata.org/wiki/Q17007046","display_name":"Closed-ended question","level":2,"score":0.26600000262260437},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.25519999861717224},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.25279998779296875}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2025.emnlp-main.988","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.988","pdf_url":"https://aclanthology.org/2025.emnlp-main.988.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.emnlp-main.988","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.988","pdf_url":"https://aclanthology.org/2025.emnlp-main.988.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4416036141.pdf","grobid_xml":"https://content.openalex.org/works/W4416036141.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"When":[0],"evaluating":[1],"large":[2],"language":[3],"models":[4],"(LLMs)":[5],"with":[6,18,95],"multiple-choice":[7],"question":[8],"answering":[9],"(MCQA),":[10],"it":[11,108],"is":[12,31],"common":[13],"to":[14,22,36,57,60,86,137],"end":[15],"the":[16,19,38,41,75,92,96,113,116,122,130],"prompt":[17],"string":[20],"\"Answer:\"":[21],"facilitate":[23],"automated":[24],"answer":[25,97],"extraction":[26],"via":[27],"next-token":[28],"probabilities.However,":[29],"there":[30],"no":[32],"consensus":[33],"on":[34],"how":[35],"tokenize":[37],"space":[39,93],"following":[40],"colon,":[42],"often":[43],"overlooked":[44],"as":[45,66,68],"a":[46],"trivial":[47],"choice.In":[48],"this":[49,61],"paper,":[50],"we":[51,83,100],"uncover":[52],"accuracy":[53],"differences":[54],"of":[55,77,115,124],"up":[56],"11%":[58],"due":[59],"(seemingly":[62],"irrelevant)":[63],"tokenization":[64],"variation":[65],"well":[67],"reshuffled":[69],"model":[70,110],"rankings,":[71],"raising":[72],"concerns":[73],"about":[74],"reliability":[76,114],"LLM":[78],"comparisons":[79],"in":[80],"prior":[81],"work.Surprisingly,":[82],"are":[84],"able":[85],"recommend":[87],"one":[88],"specific":[89],"strategy":[90],"-tokenizing":[91],"together":[94],"letter":[98],"-as":[99],"observe":[101],"consistent":[102],"and":[103,128,140],"statistically":[104],"significant":[105],"performance":[106],"improvements.Additionally,":[107],"improves":[109],"calibration,":[111],"enhancing":[112],"model's":[117],"confidence":[118],"estimates.Our":[119],"findings":[120],"underscore":[121],"importance":[123],"careful":[125],"evaluation":[126,135],"design":[127],"highlight":[129],"need":[131],"for":[132],"standardized,":[133],"transparent":[134],"protocols":[136],"ensure":[138],"reliable":[139],"comparable":[141],"results.":[142]},"counts_by_year":[],"updated_date":"2026-03-11T06:11:40.159057","created_date":"2025-11-08T00:00:00"}
