{"id":"https://openalex.org/W4415725257","doi":"https://doi.org/10.1007/s10462-025-11403-7","title":"Datasets for large language models: a comprehensive survey","display_name":"Datasets for large language models: a comprehensive survey","publication_year":2025,"publication_date":"2025-10-31","ids":{"openalex":"https://openalex.org/W4415725257","doi":"https://doi.org/10.1007/s10462-025-11403-7"},"language":"en","primary_location":{"id":"doi:10.1007/s10462-025-11403-7","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s10462-025-11403-7","pdf_url":"https://link.springer.com/content/pdf/10.1007/s10462-025-11403-7.pdf","source":{"id":"https://openalex.org/S122814990","display_name":"Artificial Intelligence Review","issn_l":"0269-2821","issn":["0269-2821","1573-7462"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Artificial Intelligence Review","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://link.springer.com/content/pdf/10.1007/s10462-025-11403-7.pdf","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100657670","display_name":"Yang Liu","orcid":"https://orcid.org/0000-0002-1297-676X"},"institutions":[{"id":"https://openalex.org/I4210098034","display_name":"Key Laboratory of Guangdong Province","ror":"https://ror.org/00swtqp09","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210098034"]},{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yang Liu","raw_affiliation_strings":["INTSIG-SCUT Joint Lab on Document Analysis and Recognition, Guangzhou, 510640, Guangdong, China","School of Electronic and Information Engineering, South China University of Technology, Guangzhou, 510640, Guangdong, China"],"affiliations":[{"raw_affiliation_string":"INTSIG-SCUT Joint Lab on Document Analysis and Recognition, Guangzhou, 510640, Guangdong, China","institution_ids":["https://openalex.org/I4210098034"]},{"raw_affiliation_string":"School of Electronic and Information Engineering, South China University of Technology, Guangzhou, 510640, Guangdong, China","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102702850","display_name":"Jiahuan Cao","orcid":null},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiahuan Cao","raw_affiliation_strings":["School of Electronic and Information Engineering, South China University of Technology, Guangzhou, 510640, Guangdong, China"],"affiliations":[{"raw_affiliation_string":"School of Electronic and Information Engineering, South China University of Technology, Guangzhou, 510640, Guangdong, China","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037101589","display_name":"Chongyu Liu","orcid":"https://orcid.org/0000-0003-2516-926X"},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chongyu Liu","raw_affiliation_strings":["School of Electronic and Information Engineering, South China University of Technology, Guangzhou, 510640, Guangdong, China"],"affiliations":[{"raw_affiliation_string":"School of Electronic and Information Engineering, South China University of Technology, Guangzhou, 510640, Guangdong, China","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101873754","display_name":"Kai Ding","orcid":"https://orcid.org/0000-0002-4214-1923"},"institutions":[{"id":"https://openalex.org/I4210089783","display_name":"Shanghai Medical Information Center","ror":"https://ror.org/007wz9933","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210089783"]},{"id":"https://openalex.org/I4210098034","display_name":"Key Laboratory of Guangdong Province","ror":"https://ror.org/00swtqp09","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210098034"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kai Ding","raw_affiliation_strings":["INTSIG Information Co., Ltd, Shanghai, 200040, Shanghai, China","INTSIG-SCUT Joint Lab on Document Analysis and Recognition, Guangzhou, 510640, Guangdong, China"],"affiliations":[{"raw_affiliation_string":"INTSIG Information Co., Ltd, Shanghai, 200040, Shanghai, China","institution_ids":["https://openalex.org/I4210089783"]},{"raw_affiliation_string":"INTSIG-SCUT Joint Lab on Document Analysis and Recognition, Guangzhou, 510640, Guangdong, China","institution_ids":["https://openalex.org/I4210098034"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5080674767","display_name":"Lianwen Jin","orcid":"https://orcid.org/0000-0002-5456-0957"},"institutions":[{"id":"https://openalex.org/I4210098034","display_name":"Key Laboratory of Guangdong Province","ror":"https://ror.org/00swtqp09","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210098034"]},{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lianwen Jin","raw_affiliation_strings":["INTSIG-SCUT Joint Lab on Document Analysis and Recognition, Guangzhou, 510640, Guangdong, China","School of Electronic and Information Engineering, South China University of Technology, Guangzhou, 510640, Guangdong, China"],"affiliations":[{"raw_affiliation_string":"INTSIG-SCUT Joint Lab on Document Analysis and Recognition, Guangzhou, 510640, Guangdong, China","institution_ids":["https://openalex.org/I4210098034"]},{"raw_affiliation_string":"School of Electronic and Information Engineering, South China University of Technology, Guangzhou, 510640, Guangdong, China","institution_ids":["https://openalex.org/I90610280"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5100657670"],"corresponding_institution_ids":["https://openalex.org/I4210098034","https://openalex.org/I90610280"],"apc_list":{"value":2490,"currency":"EUR","value_usd":3090},"apc_paid":{"value":2490,"currency":"EUR","value_usd":3090},"fwci":42.3379,"has_fulltext":true,"cited_by_count":19,"citation_normalized_percentile":{"value":0.99757477,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":"58","issue":"12","first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13629","display_name":"Text Readability and Simplification","score":0.30149999260902405,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13629","display_name":"Text Readability and Simplification","score":0.30149999260902405,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.2802000045776367,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.19840000569820404,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.620199978351593},{"id":"https://openalex.org/keywords/preference","display_name":"Preference","score":0.42260000109672546},{"id":"https://openalex.org/keywords/order","display_name":"Order (exchange)","score":0.4120999872684479},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.28949999809265137},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.28130000829696655}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7753000259399414},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.6665999889373779},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.620199978351593},{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.42260000109672546},{"id":"https://openalex.org/C182306322","wikidata":"https://www.wikidata.org/wiki/Q1779371","display_name":"Order (exchange)","level":2,"score":0.4120999872684479},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.28949999809265137},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.28130000829696655},{"id":"https://openalex.org/C171078966","wikidata":"https://www.wikidata.org/wiki/Q111029","display_name":"Root (linguistics)","level":2,"score":0.28110000491142273},{"id":"https://openalex.org/C133462117","wikidata":"https://www.wikidata.org/wiki/Q4929239","display_name":"Data collection","level":2,"score":0.2786000072956085},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.2689000070095062}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1007/s10462-025-11403-7","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s10462-025-11403-7","pdf_url":"https://link.springer.com/content/pdf/10.1007/s10462-025-11403-7.pdf","source":{"id":"https://openalex.org/S122814990","display_name":"Artificial Intelligence Review","issn_l":"0269-2821","issn":["0269-2821","1573-7462"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Artificial Intelligence Review","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1007/s10462-025-11403-7","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s10462-025-11403-7","pdf_url":"https://link.springer.com/content/pdf/10.1007/s10462-025-11403-7.pdf","source":{"id":"https://openalex.org/S122814990","display_name":"Artificial Intelligence Review","issn_l":"0269-2821","issn":["0269-2821","1573-7462"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Artificial Intelligence Review","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1428435317","display_name":null,"funder_award_id":"Grant No.:","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2504063345","display_name":null,"funder_award_id":"6244160","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5270964542","display_name":null,"funder_award_id":"62441604","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G6561198578","display_name":null,"funder_award_id":"62476093","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G6852825479","display_name":null,"funder_award_id":"61936003","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4415725257.pdf","grobid_xml":"https://content.openalex.org/works/W4415725257.grobid-xml"},"referenced_works_count":102,"referenced_works":["https://openalex.org/W1566289585","https://openalex.org/W2064675550","https://openalex.org/W2068882115","https://openalex.org/W2101105183","https://openalex.org/W2604580630","https://openalex.org/W2792505675","https://openalex.org/W2890431379","https://openalex.org/W2891555348","https://openalex.org/W2923014074","https://openalex.org/W2988937804","https://openalex.org/W3002330681","https://openalex.org/W3015453090","https://openalex.org/W3035016936","https://openalex.org/W3035390927","https://openalex.org/W3093956460","https://openalex.org/W3101223450","https://openalex.org/W3105882417","https://openalex.org/W3114651185","https://openalex.org/W3152515526","https://openalex.org/W3169113923","https://openalex.org/W3169341408","https://openalex.org/W3169483174","https://openalex.org/W3176750236","https://openalex.org/W3196731672","https://openalex.org/W3198845875","https://openalex.org/W3201174429","https://openalex.org/W3204112174","https://openalex.org/W4224928124","https://openalex.org/W4239025652","https://openalex.org/W4285178342","https://openalex.org/W4285294723","https://openalex.org/W4293057253","https://openalex.org/W4311887664","https://openalex.org/W4324142688","https://openalex.org/W4378189609","https://openalex.org/W4381930847","https://openalex.org/W4384071683","https://openalex.org/W4385568240","https://openalex.org/W4385570984","https://openalex.org/W4385571124","https://openalex.org/W4385571157","https://openalex.org/W4385571633","https://openalex.org/W4385572001","https://openalex.org/W4385572634","https://openalex.org/W4385572697","https://openalex.org/W4385572845","https://openalex.org/W4385572906","https://openalex.org/W4385572953","https://openalex.org/W4385573018","https://openalex.org/W4385573116","https://openalex.org/W4385734111","https://openalex.org/W4385894687","https://openalex.org/W4387093135","https://openalex.org/W4387847108","https://openalex.org/W4389403907","https://openalex.org/W4389518608","https://openalex.org/W4389518761","https://openalex.org/W4389518784","https://openalex.org/W4389519019","https://openalex.org/W4389519042","https://openalex.org/W4389519248","https://openalex.org/W4389519287","https://openalex.org/W4389519291","https://openalex.org/W4389519431","https://openalex.org/W4389519438","https://openalex.org/W4389519598","https://openalex.org/W4389519602","https://openalex.org/W4389519979","https://openalex.org/W4389520259","https://openalex.org/W4389520703","https://openalex.org/W4389524372","https://openalex.org/W4391136507","https://openalex.org/W4392384758","https://openalex.org/W4393147120","https://openalex.org/W4393147146","https://openalex.org/W4393153123","https://openalex.org/W4393160809","https://openalex.org/W4401042685","https://openalex.org/W4401042689","https://openalex.org/W4401043132","https://openalex.org/W4402057292","https://openalex.org/W4402670280","https://openalex.org/W4402670999","https://openalex.org/W4402671258","https://openalex.org/W4402671286","https://openalex.org/W4402671302","https://openalex.org/W4402671569","https://openalex.org/W4402671766","https://openalex.org/W4402671800","https://openalex.org/W4402684046","https://openalex.org/W4402684121","https://openalex.org/W4404515031","https://openalex.org/W4404780958","https://openalex.org/W4404783199","https://openalex.org/W4404783465","https://openalex.org/W4407097799","https://openalex.org/W4411113094","https://openalex.org/W4411119522","https://openalex.org/W4411630291","https://openalex.org/W4411630296","https://openalex.org/W4411630326","https://openalex.org/W4412158322"],"related_works":[],"abstract_inverted_index":{"This":[0],"paper":[1],"embarks":[2],"on":[3,115],"an":[4],"exploration":[5],"into":[6,77,159],"the":[7,19,28,40,60,89,116,132,160,185],"large":[8],"language":[9,147],"model":[10],"(LLM)":[11],"datasets,":[12,72,144,191],"which":[13],"play":[14],"a":[15,33,51,64,128,194],"crucial":[16],"role":[17],"in":[18,54,199],"remarkable":[20],"advancements":[21],"of":[22,42,46,63,70,92,131,188],"LLMs.":[23,43],"The":[24,111,163],"datasets":[25,48,94],"serve":[26],"as":[27,50,193],"foundational":[29],"infrastructure":[30],"analogous":[31],"to":[32,58,74,183,204],"root":[34],"system":[35],"that":[36],"sustains":[37],"and":[38,67,73,81,87,119,149,174,202],"nurtures":[39],"development":[41],"Consequently,":[44],"examination":[45],"these":[47],"emerges":[49],"critical":[52],"topic":[53],"research.":[55],"In":[56],"order":[57],"address":[59],"current":[61,79],"lack":[62],"comprehensive":[65,129,195],"overview":[66],"thorough":[68],"analysis":[69],"LLM":[71,93,189],"gain":[75],"insights":[76],"their":[78],"status":[80],"future":[82,125,205],"trends,":[83],"this":[84,200],"survey":[85,112],"consolidates":[86],"categorizes":[88],"fundamental":[90],"aspects":[91],"from":[95,142,154],"four":[96],"perspectives:":[97],"(a)":[98],"pre-training":[99,172],"corpora;":[100],"(b)":[101],"instruction":[102],"fine-tuning":[103],"datasets;":[104,107],"(c)":[105],"preference":[106],"(d)":[108],"evaluation":[109],"datasets.":[110,180],"sheds":[113],"light":[114],"prevailing":[117],"challenges":[118],"points":[120],"out":[121],"potential":[122],"avenues":[123],"for":[124,171,178,197],"investigation.":[126],"Additionally,":[127],"review":[130],"existing":[133],"available":[134,210],"dataset":[135,161],"resources":[136,208],"is":[137,157],"also":[138],"provided,":[139],"including":[140],"statistics":[141],"303":[143],"covering":[145],"8":[146],"categories":[148],"spanning":[150],"32":[151],"domains.":[152],"Information":[153],"20":[155],"dimensions":[156],"incorporated":[158],"statistics.":[162],"total":[164],"data":[165],"size":[166],"surveyed":[167],"surpasses":[168],"774.5":[169],"TB":[170],"corpora":[173],"700":[175],"M":[176],"instances":[177],"other":[179],"We":[181],"aim":[182],"present":[184],"entire":[186],"landscape":[187],"text":[190],"serving":[192],"reference":[196],"researchers":[198],"field":[201],"contributing":[203],"studies.":[206],"Related":[207],"are":[209],"at:":[211],"https://github.com/lmmlzn/Awesome-LLMs-Datasets":[212],".":[213]},"counts_by_year":[{"year":2026,"cited_by_count":9},{"year":2025,"cited_by_count":10}],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2025-10-31T00:00:00"}
