{"id":"https://openalex.org/W4416036557","doi":"https://doi.org/10.18653/v1/2025.emnlp-main.632","title":"Examining False Positives under Inference Scaling for Mathematical Reasoning","display_name":"Examining False Positives under Inference Scaling for Mathematical Reasoning","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4416036557","doi":"https://doi.org/10.18653/v1/2025.emnlp-main.632"},"language":null,"primary_location":{"id":"doi:10.18653/v1/2025.emnlp-main.632","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.632","pdf_url":"https://aclanthology.org/2025.emnlp-main.632.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.emnlp-main.632.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100445368","display_name":"Yu Wang","orcid":"https://orcid.org/0000-0003-3511-0288"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yu Wang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046560716","display_name":"Nan Yang","orcid":"https://orcid.org/0000-0003-2895-5792"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nan Yang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100456586","display_name":"Liang Wang","orcid":"https://orcid.org/0000-0003-2719-2463"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liang Wang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014662947","display_name":"Furu Wei","orcid":"https://orcid.org/0000-0002-7810-5852"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Furu Wei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5051925942","display_name":"Fuli Feng","orcid":"https://orcid.org/0000-0002-5828-9842"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fuli Feng","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5100445368"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.31782085,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"12512","last_page":"12531"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11345","display_name":"Cognitive and developmental aspects of mathematical skills","score":0.21780000627040863,"subfield":{"id":"https://openalex.org/subfields/2613","display_name":"Statistics and Probability"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11345","display_name":"Cognitive and developmental aspects of mathematical skills","score":0.21780000627040863,"subfield":{"id":"https://openalex.org/subfields/2613","display_name":"Statistics and Probability"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11902","display_name":"Intelligent Tutoring Systems and Adaptive Learning","score":0.1923000067472458,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10130","display_name":"Mathematics Education and Teaching Techniques","score":0.08569999784231186,"subfield":{"id":"https://openalex.org/subfields/3304","display_name":"Education"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/false-positive-paradox","display_name":"False positive paradox","score":0.525600016117096},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5101000070571899},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.36880001425743103},{"id":"https://openalex.org/keywords/probabilistic-logic","display_name":"Probabilistic logic","score":0.3100000023841858},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.30379998683929443},{"id":"https://openalex.org/keywords/model-based-reasoning","display_name":"Model-based reasoning","score":0.30329999327659607}],"concepts":[{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5982000231742859},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5814999938011169},{"id":"https://openalex.org/C64869954","wikidata":"https://www.wikidata.org/wiki/Q1859747","display_name":"False positive paradox","level":2,"score":0.525600016117096},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5101000070571899},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.36880001425743103},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3646000027656555},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.3100000023841858},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.30379998683929443},{"id":"https://openalex.org/C37335422","wikidata":"https://www.wikidata.org/wiki/Q6888134","display_name":"Model-based reasoning","level":3,"score":0.30329999327659607},{"id":"https://openalex.org/C76969082","wikidata":"https://www.wikidata.org/wiki/Q486902","display_name":"Mathematical model","level":2,"score":0.2948000133037567},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.27709999680519104},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.2624000012874603},{"id":"https://openalex.org/C95167961","wikidata":"https://www.wikidata.org/wiki/Q4483495","display_name":"Fiducial inference","level":5,"score":0.2596000134944916},{"id":"https://openalex.org/C195344581","wikidata":"https://www.wikidata.org/wiki/Q2555318","display_name":"Automated reasoning","level":2,"score":0.2547000050544739}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2025.emnlp-main.632","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.632","pdf_url":"https://aclanthology.org/2025.emnlp-main.632.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.emnlp-main.632","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.632","pdf_url":"https://aclanthology.org/2025.emnlp-main.632.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4416036557.pdf","grobid_xml":"https://content.openalex.org/works/W4416036557.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"advancements":[1],"in":[2,10,40,67,169],"language":[3,72,106],"models":[4,45],"have":[5],"led":[6],"to":[7,144],"significant":[8],"improvements":[9],"mathematical":[11,68],"reasoning":[12,36],"across":[13,82,117],"various":[14],"benchmarks.However,":[15],"most":[16],"of":[17,63,79,87,105,162],"these":[18],"benchmarks":[19],"rely":[20],"on":[21],"automatic":[22,155],"evaluation":[23,139],"methods":[24,129],"that":[25],"only":[26],"compare":[27],"final":[28,49],"answers":[29,50],"using":[30],"heuristics,":[31],"without":[32],"verifying":[33],"the":[34,61,75,100,133,137],"underlying":[35],"steps.This":[37],"limitation":[38],"results":[39,109],"false":[41,64,113],"positive":[42,65,114],"solutions,":[43],"where":[44],"may":[46],"produce":[47],"correct":[48],"but":[51],"with":[52],"flawed":[53],"deduction":[54],"paths.In":[55],"this":[56,80],"paper,":[57],"we":[58,94,158],"systematically":[59],"examine":[60],"prevalence":[62],"solutions":[66,115],"problem":[69],"solving":[70],"for":[71],"models.We":[73],"analyze":[74,159],"characteristics":[76],"and":[77,91,121,135,165,172],"extent":[78],"issue":[81],"different":[83,118],"open-source":[84],"models,":[85,119],"datasets":[86],"varying":[88],"difficulty":[89],"levels,":[90],"decoding":[92,122],"strategies.Specifically,":[93],"explore":[95],"how":[96],"\"false":[97,145,163],"positives\"":[98,164],"influence":[99],"inference":[101,126],"time":[102,127],"scaling":[103,128,151],"behavior":[104],"models.Our":[107],"experimental":[108],"reveal":[110],"that:":[111],"(1)":[112],"persist":[116],"datasets,":[120],"methods,":[123],"(2)":[124],"sampling-based":[125],"do":[130],"not":[131],"alleviate":[132],"problem,":[134],"(3)":[136],"pass@N":[138],"metric":[140],"is":[141],"more":[142],"susceptible":[143],"positives\",":[146],"suggesting":[147],"a":[148],"significantly":[149],"lower":[150],"ceiling":[152],"than":[153],"what":[154],"evaluations":[156],"indicate.Additionally,":[157],"specific":[160],"instances":[161],"discuss":[166],"potential":[167],"limitations":[168],"self-improvement":[170],"techniques":[171],"synthetic":[173],"data":[174],"generation":[175],"under":[176],"such":[177],"conditions.":[178]},"counts_by_year":[],"updated_date":"2026-03-11T06:11:40.159057","created_date":"2025-11-08T00:00:00"}
