{"id":"https://openalex.org/W4416037405","doi":"https://doi.org/10.18653/v1/2025.emnlp-industry.43","title":"More Data or Better Data? A Critical Analysis of Data Selection and Synthesis for Mathematical Reasoning","display_name":"More Data or Better Data? A Critical Analysis of Data Selection and Synthesis for Mathematical Reasoning","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4416037405","doi":"https://doi.org/10.18653/v1/2025.emnlp-industry.43"},"language":null,"primary_location":{"id":"doi:10.18653/v1/2025.emnlp-industry.43","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-industry.43","pdf_url":"https://aclanthology.org/2025.emnlp-industry.43.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.emnlp-industry.43.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5037654460","display_name":"Yijie Zhao","orcid":"https://orcid.org/0000-0002-2202-8376"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yike Zhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101580861","display_name":"Simin Guo","orcid":"https://orcid.org/0000-0002-5729-616X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Simin Guo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001241589","display_name":"Ziqing Yang","orcid":"https://orcid.org/0000-0002-0492-0862"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ziqing Yang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018437726","display_name":"Shifan Han","orcid":"https://orcid.org/0000-0003-4448-1689"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shifan Han","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010087030","display_name":"Dahua Lin","orcid":"https://orcid.org/0000-0002-8865-7896"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dahua Lin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5032121414","display_name":"Fei Tan","orcid":"https://orcid.org/0000-0001-7057-581X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fei Tan","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5037654460"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.35692294,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"618","last_page":"629"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10130","display_name":"Mathematics Education and Teaching Techniques","score":0.6191999912261963,"subfield":{"id":"https://openalex.org/subfields/3304","display_name":"Education"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10130","display_name":"Mathematics Education and Teaching Techniques","score":0.6191999912261963,"subfield":{"id":"https://openalex.org/subfields/3304","display_name":"Education"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11875","display_name":"Statistics Education and Methodologies","score":0.04520000144839287,"subfield":{"id":"https://openalex.org/subfields/2613","display_name":"Statistics and Probability"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11345","display_name":"Cognitive and developmental aspects of mathematical skills","score":0.031700000166893005,"subfield":{"id":"https://openalex.org/subfields/2613","display_name":"Statistics and Probability"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.48010000586509705},{"id":"https://openalex.org/keywords/feature-selection","display_name":"Feature selection","score":0.29649999737739563},{"id":"https://openalex.org/keywords/model-selection","display_name":"Model selection","score":0.2660999894142151},{"id":"https://openalex.org/keywords/data-modeling","display_name":"Data modeling","score":0.2549999952316284},{"id":"https://openalex.org/keywords/data-analysis","display_name":"Data analysis","score":0.251800000667572}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6312000155448914},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5209000110626221},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.48010000586509705},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3522000014781952},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.34599998593330383},{"id":"https://openalex.org/C148483581","wikidata":"https://www.wikidata.org/wiki/Q446488","display_name":"Feature selection","level":2,"score":0.29649999737739563},{"id":"https://openalex.org/C93959086","wikidata":"https://www.wikidata.org/wiki/Q6888345","display_name":"Model selection","level":2,"score":0.2660999894142151},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2549999952316284},{"id":"https://openalex.org/C175801342","wikidata":"https://www.wikidata.org/wiki/Q1988917","display_name":"Data analysis","level":2,"score":0.251800000667572},{"id":"https://openalex.org/C76969082","wikidata":"https://www.wikidata.org/wiki/Q486902","display_name":"Mathematical model","level":2,"score":0.2517000138759613}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2025.emnlp-industry.43","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-industry.43","pdf_url":"https://aclanthology.org/2025.emnlp-industry.43.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.emnlp-industry.43","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-industry.43","pdf_url":"https://aclanthology.org/2025.emnlp-industry.43.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4416037405.pdf","grobid_xml":"https://content.openalex.org/works/W4416037405.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0],"reasoning":[1,142],"capabilities":[2],"of":[3,22,45],"Large":[4],"Language":[5],"Models":[6],"(LLMs)":[7],"play":[8],"a":[9,42,58],"critical":[10],"role":[11],"in":[12,33,87],"many":[13],"downstream":[14],"tasks,":[15],"yet":[16],"depend":[17],"strongly":[18],"on":[19,131],"the":[20],"quality":[21],"training":[23,64,109],"data.Despite":[24],"various":[25],"proposed":[26],"data":[27,49,71,86,101,110,118],"construction":[28],"methods,":[29],"their":[30],"practical":[31,76],"utility":[32],"real-world":[34,141],"pipelines":[35],"remains":[36],"underexplored.In":[37],"this":[38,125],"work,":[39],"we":[40],"conduct":[41],"comprehensive":[43],"analysis":[44],"open-source":[46],"datasets":[47],"and":[48,65,74,120],"synthesis":[50],"techniques":[51],"for":[52,79,107,140],"mathematical":[53],"reasoning,":[54],"evaluating":[55],"them":[56],"under":[57],"unified":[59],"pipeline":[60],"designed":[61],"to":[62,111,133],"mirror":[63],"deployment":[66],"scenarios.We":[67],"further":[68,129],"distill":[69],"effective":[70],"selection":[72],"strategies":[73],"identify":[75],"methods":[77],"suitable":[78],"industrial":[80],"applications.Our":[81],"findings":[82],"highlight":[83],"that":[84],"structuring":[85],"more":[88],"interpretable":[89],"formats,":[90],"or":[91],"distilling":[92],"from":[93],"stronger":[94],"models":[95],"often":[96],"outweighs":[97],"simply":[98],"scaling":[99],"up":[100],"volume.This":[102],"study":[103],"provides":[104],"actionable":[105],"guidance":[106],"integrating":[108],"enhance":[112],"LLM":[113],"capabilities,":[114],"supporting":[115],"both":[116],"cost-effective":[117],"curation":[119],"scalable":[121],"model":[122],"enhancement.We":[123],"hope":[124],"work":[126],"will":[127],"inspire":[128],"research":[130],"how":[132],"balance":[134],"\"more":[135],"data\"":[136,139],"versus":[137],"\"better":[138],"tasks.":[143]},"counts_by_year":[],"updated_date":"2026-03-12T06:13:28.667946","created_date":"2025-11-08T00:00:00"}
