{"id":"https://openalex.org/W4403536522","doi":"https://doi.org/10.1145/3691620.3695061","title":"What Makes a High-Quality Training Dataset for Large Language Models: A Practitioners' Perspective","display_name":"What Makes a High-Quality Training Dataset for Large Language Models: A Practitioners' Perspective","publication_year":2024,"publication_date":"2024-10-18","ids":{"openalex":"https://openalex.org/W4403536522","doi":"https://doi.org/10.1145/3691620.3695061"},"language":"en","primary_location":{"id":"doi:10.1145/3691620.3695061","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3691620.3695061","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 39th IEEE/ACM International Conference on Automated Software Engineering","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5047060039","display_name":"Xiao Yu","orcid":"https://orcid.org/0000-0002-4473-3068"},"institutions":[{"id":"https://openalex.org/I2250955327","display_name":"Huawei Technologies (China)","ror":"https://ror.org/00cmhce21","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250955327"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Xiao Yu","raw_affiliation_strings":["Huawei, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Huawei, Hangzhou, China","institution_ids":["https://openalex.org/I2250955327"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031694064","display_name":"Zexian Zhang","orcid":"https://orcid.org/0009-0005-7235-1170"},"institutions":[{"id":"https://openalex.org/I196699116","display_name":"Wuhan University of Technology","ror":"https://ror.org/03fe7t173","country_code":"CN","type":"education","lineage":["https://openalex.org/I196699116"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zexian Zhang","raw_affiliation_strings":["School of Computer Science and Artificial Intelligence, Wuhan University of Technology, Wuhan, China","Wuhan University of Technology Chongqing Research Institute, Chongqing, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Artificial Intelligence, Wuhan University of Technology, Wuhan, China","institution_ids":["https://openalex.org/I196699116"]},{"raw_affiliation_string":"Wuhan University of Technology Chongqing Research Institute, Chongqing, China","institution_ids":["https://openalex.org/I196699116"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065153631","display_name":"Feifei Niu","orcid":"https://orcid.org/0000-0002-4123-4554"},"institutions":[{"id":"https://openalex.org/I153718931","display_name":"University of Ottawa","ror":"https://ror.org/03c4mmv16","country_code":"CA","type":"education","lineage":["https://openalex.org/I153718931"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Feifei Niu","raw_affiliation_strings":["School of Electrical Engineering and Computer Science, University of Ottawa, Ottawa, Canada"],"affiliations":[{"raw_affiliation_string":"School of Electrical Engineering and Computer Science, University of Ottawa, Ottawa, Canada","institution_ids":["https://openalex.org/I153718931"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047688641","display_name":"Xing Hu","orcid":"https://orcid.org/0000-0003-0093-3292"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xing Hu","raw_affiliation_strings":["The State Key Laboratory of Blockchain and Data Security, Zhejiang University, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"The State Key Laboratory of Blockchain and Data Security, Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006669765","display_name":"Xin Xia","orcid":"https://orcid.org/0000-0002-6302-3256"},"institutions":[{"id":"https://openalex.org/I2250955327","display_name":"Huawei Technologies (China)","ror":"https://ror.org/00cmhce21","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250955327"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xin Xia","raw_affiliation_strings":["Huawei, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Huawei, Hangzhou, China","institution_ids":["https://openalex.org/I2250955327"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5082913979","display_name":"John Grundy","orcid":null},"institutions":[{"id":"https://openalex.org/I56590836","display_name":"Monash University","ror":"https://ror.org/02bfwt286","country_code":"AU","type":"education","lineage":["https://openalex.org/I56590836"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"John Grundy","raw_affiliation_strings":["Faculty of Information Technology, Monash University, Victoria, Australia"],"affiliations":[{"raw_affiliation_string":"Faculty of Information Technology, Monash University, Victoria, Australia","institution_ids":["https://openalex.org/I56590836"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5047060039"],"corresponding_institution_ids":["https://openalex.org/I2250955327"],"apc_list":null,"apc_paid":null,"fwci":6.2008,"has_fulltext":false,"cited_by_count":13,"citation_normalized_percentile":{"value":0.9665088,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"656","last_page":"668"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9973999857902527,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9876000285148621,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/perspective","display_name":"Perspective (graphical)","score":0.8010203838348389},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7542845010757446},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.6293609142303467},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.6262152194976807},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.44480592012405396},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4185892343521118},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.38670986890792847},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.37598901987075806},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.36564314365386963}],"concepts":[{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.8010203838348389},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7542845010757446},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.6293609142303467},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.6262152194976807},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.44480592012405396},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4185892343521118},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.38670986890792847},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.37598901987075806},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.36564314365386963},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3691620.3695061","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3691620.3695061","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 39th IEEE/ACM International Conference on Automated Software Engineering","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.6800000071525574,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[{"id":"https://openalex.org/G5567504548","display_name":null,"funder_award_id":"cstc2021jcyj-msxmX1115","funder_id":"https://openalex.org/F4320323172","funder_display_name":"Natural Science Foundation of Chongqing"}],"funders":[{"id":"https://openalex.org/F4320323172","display_name":"Natural Science Foundation of Chongqing","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":34,"referenced_works":["https://openalex.org/W166457893","https://openalex.org/W1484727516","https://openalex.org/W1567491469","https://openalex.org/W1990028388","https://openalex.org/W2011044158","https://openalex.org/W2034334856","https://openalex.org/W2066045860","https://openalex.org/W2076646346","https://openalex.org/W2136082883","https://openalex.org/W2165394871","https://openalex.org/W2290195878","https://openalex.org/W2474835145","https://openalex.org/W2523518582","https://openalex.org/W2954606160","https://openalex.org/W2969465322","https://openalex.org/W2979792666","https://openalex.org/W3096218984","https://openalex.org/W3136987292","https://openalex.org/W3160042717","https://openalex.org/W3202367862","https://openalex.org/W3212551653","https://openalex.org/W4281763794","https://openalex.org/W4308643318","https://openalex.org/W4323966606","https://openalex.org/W4366683629","https://openalex.org/W4385572142","https://openalex.org/W4388483497","https://openalex.org/W4388483649","https://openalex.org/W4389519225","https://openalex.org/W4389672268","https://openalex.org/W4391558520","https://openalex.org/W4391828195","https://openalex.org/W4391974543","https://openalex.org/W6921297175"],"related_works":["https://openalex.org/W2899084033","https://openalex.org/W230091440","https://openalex.org/W2233261550","https://openalex.org/W2810751659","https://openalex.org/W258997015","https://openalex.org/W2149537132","https://openalex.org/W2018871932","https://openalex.org/W2997094352","https://openalex.org/W641279757","https://openalex.org/W370975646"],"abstract_inverted_index":{"Large":[0],"Language":[1],"Models":[2],"(LLMs)":[3],"have":[4],"demonstrated":[5],"remarkable":[6],"performance":[7],"in":[8],"various":[9],"application":[10],"domains,":[11],"largely":[12],"due":[13],"to":[14,112,202],"their":[15,37,123],"self-supervised":[16],"pre-training":[17],"on":[18,190],"extensive":[19],"high-quality":[20,54,69,154,204],"text":[21],"datasets.":[22],"However,":[23],"despite":[24],"the":[25,114,126,139,195],"importance":[26,115],"of":[27,36,50,68,116,153],"constructing":[28],"such":[29],"datasets,":[30,72],"many":[31],"leading":[32],"LLMs":[33],"lack":[34],"documentation":[35],"dataset":[38,56],"construction":[39],"and":[40,81,130,137,176,199],"training":[41,55,71,205],"procedures,":[42],"leaving":[43],"LLM":[44,70,93,99,155],"practitioners":[45,100,200],"with":[46,90,182],"a":[47,53,120,159],"limited":[48],"understanding":[49],"what":[51],"makes":[52],"for":[57,122,197,207],"LLMs.":[58,209],"To":[59],"fill":[60],"this":[61],"gap,":[62],"we":[63,148,193],"initially":[64],"identified":[65,149,171],"18":[66],"characteristics":[67,152],"as":[73,75],"well":[74],"10":[76],"potential":[77],"data":[78,83,128,131,174,177],"pre-processing":[79,129,175],"methods":[80,134],"6":[82],"quality":[84,132,178],"assessment":[85,133,179],"methods,":[86,180],"through":[87],"detailed":[88],"interviews":[89],"13":[91,150],"experienced":[92],"professionals.":[94],"We":[95,107,169],"then":[96],"surveyed":[97],"219":[98],"from":[101],"23":[102],"countries":[103],"across":[104],"5":[105],"continents.":[106],"asked":[108],"our":[109,146,191],"survey":[110],"respondents":[111],"rate":[113],"these":[117,143,187],"characteristics,":[118],"provide":[119],"rationale":[121,165],"ratings,":[124],"specify":[125],"key":[127,164],"they":[135],"used,":[136],"highlight":[138],"challenges":[140,184],"encountered":[141,185],"during":[142,186],"processes.":[144,188],"From":[145],"analysis,":[147],"crucial":[151],"datasets":[156,206],"that":[157],"receive":[158],"high":[160],"rating,":[161],"accompanied":[162],"by":[163,167],"provided":[166],"respondents.":[168],"also":[170],"some":[172],"widely-used":[173],"along":[181],"7":[183],"Based":[189],"findings,":[192],"discuss":[194],"implications":[196],"researchers":[198],"aiming":[201],"construct":[203],"optimizing":[208]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":10}],"updated_date":"2026-04-14T08:04:32.555800","created_date":"2025-10-10T00:00:00"}
