{"id":"https://openalex.org/W4416034818","doi":"https://doi.org/10.18653/v1/2025.findings-emnlp.616","title":"Structure Trumps Size: Rethinking Data Quality for LLM Reasoning","display_name":"Structure Trumps Size: Rethinking Data Quality for LLM Reasoning","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4416034818","doi":"https://doi.org/10.18653/v1/2025.findings-emnlp.616"},"language":null,"primary_location":{"id":"doi:10.18653/v1/2025.findings-emnlp.616","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-emnlp.616","pdf_url":"https://aclanthology.org/2025.findings-emnlp.616.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: EMNLP 2025","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.findings-emnlp.616.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100385219","display_name":"Xu Hu","orcid":"https://orcid.org/0000-0002-6345-2448"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hu Xu","raw_affiliation_strings":["School of Computer Science Shanghai Jiao Tong University Koguan School of Law China Institute for Smart Justice School of Computer Science Shanghai Jiao Tong University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer Science Shanghai Jiao Tong University Koguan School of Law China Institute for Smart Justice School of Computer Science Shanghai Jiao Tong University","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063988681","display_name":"Zeyan Li","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zeyan Li","raw_affiliation_strings":["School of Computer Science Shanghai Jiao Tong University Koguan School of Law China Institute for Smart Justice School of Computer Science Shanghai Jiao Tong University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer Science Shanghai Jiao Tong University Koguan School of Law China Institute for Smart Justice School of Computer Science Shanghai Jiao Tong University","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100431149","display_name":"Rui Wang","orcid":"https://orcid.org/0000-0001-8007-2503"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Rui Wang","raw_affiliation_strings":["School of Computer Science Shanghai Jiao Tong University Koguan School of Law China Institute for Smart Justice School of Computer Science Shanghai Jiao Tong University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer Science Shanghai Jiao Tong University Koguan School of Law China Institute for Smart Justice School of Computer Science Shanghai Jiao Tong University","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101652521","display_name":"Jianfeng Xu","orcid":"https://orcid.org/0000-0001-6929-3815"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianfeng Xu","raw_affiliation_strings":["School of Computer Science Shanghai Jiao Tong University Koguan School of Law China Institute for Smart Justice School of Computer Science Shanghai Jiao Tong University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer Science Shanghai Jiao Tong University Koguan School of Law China Institute for Smart Justice School of Computer Science Shanghai Jiao Tong University","institution_ids":["https://openalex.org/I183067930"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.35081318,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"11489","last_page":"11513"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.8345999717712402,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.8345999717712402,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.02370000071823597,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.009200000204145908,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/data-quality","display_name":"Data quality","score":0.4447000026702881},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.4056999981403351},{"id":"https://openalex.org/keywords/data-collection","display_name":"Data collection","score":0.33090001344680786},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.28780001401901245},{"id":"https://openalex.org/keywords/data-modeling","display_name":"Data modeling","score":0.27570000290870667}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6381000280380249},{"id":"https://openalex.org/C24756922","wikidata":"https://www.wikidata.org/wiki/Q1757694","display_name":"Data quality","level":3,"score":0.4447000026702881},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.4056999981403351},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4025000035762787},{"id":"https://openalex.org/C133462117","wikidata":"https://www.wikidata.org/wiki/Q4929239","display_name":"Data collection","level":2,"score":0.33090001344680786},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.28780001401901245},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.27570000290870667},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2597000002861023},{"id":"https://openalex.org/C33762810","wikidata":"https://www.wikidata.org/wiki/Q461671","display_name":"Data integrity","level":2,"score":0.24969999492168427},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.2433999925851822}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2025.findings-emnlp.616","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-emnlp.616","pdf_url":"https://aclanthology.org/2025.findings-emnlp.616.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: EMNLP 2025","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.findings-emnlp.616","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-emnlp.616","pdf_url":"https://aclanthology.org/2025.findings-emnlp.616.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: EMNLP 2025","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4416034818.pdf","grobid_xml":"https://content.openalex.org/works/W4416034818.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"As":[0],"domain-specific":[1],"datasets":[2,96],"continue":[3],"to":[4],"expand,":[5],"Large":[6],"Language":[7],"Models":[8],"(LLMs)":[9],"have":[10],"achieved":[11],"significant":[12],"improvements":[13],"across":[14],"various":[15],"fields":[16],"through":[17],"supervised":[18],"fine-tuning":[19,139],"(SFT).However,":[20],"is":[21],"more":[22,60],"data":[23,55,76,90,124,135],"always":[24],"better":[25],"for":[26,74,88,145],"model":[27],"fine-tuning?Through":[28],"a":[29,42],"series":[30],"of":[31],"controlled":[32,102],"experiments,":[33,103],"we":[34,81,104],"discover":[35],"that":[36,53,106],"dataset":[37,148],"structure-rather":[38],"than":[39],"mere":[40],"size-plays":[41],"decisive":[43],"role":[44],"in":[45,118,137],"enhancing":[46],"LLM":[47,138],"reasoning":[48,89,128],"capabilities.While":[49],"existing":[50],"methods":[51],"acknowledge":[52],"good":[54],"quality":[56],"can":[57],"make":[58],"training":[59],"efficient,":[61,146],"they":[62],"primarily":[63],"rely":[64],"on":[65,125],"simple":[66],"heuristic":[67],"strategies":[68],"and":[69,94,140],"lack":[70],"systematic,":[71],"quantitative":[72,86,143],"frameworks":[73],"evaluating":[75],"quality.To":[77],"address":[78],"this":[79],"gap,":[80],"introduce":[82],"MCSQ-the":[83],"first":[84],"multi-dimensional":[85],"framework":[87],"management.MCSQ":[91],"rigorously":[92],"evaluates":[93],"optimizes":[95],"along":[97],"six":[98],"orthogonal":[99],"dimensions.Through":[100],"comprehensive":[101],"find":[105],"selectively":[107],"incorporating":[108],"\"distorted\"":[109],"(model-disagreed)":[110],"or":[111],"\"mismatched\"":[112],"(low-relevance)":[113],"samples-which":[114],"are":[115],"typically":[116],"discarded":[117],"traditional":[119,132],"approaches-can":[120],"outperform":[121],"conventional":[122],"\"clean\"":[123],"certain":[126],"advanced":[127],"benchmarks.Our":[129],"findings":[130],"challenge":[131],"assumptions":[133],"about":[134],"\"quality\"":[136],"provide":[141],"actionable,":[142],"guidance":[144],"structure-aware":[147],"management.":[149]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-11-08T00:00:00"}
