{"id":"https://openalex.org/W4398234485","doi":"https://doi.org/10.1145/3626246.3653385","title":"Data-Juicer: A One-Stop Data Processing System for Large Language Models","display_name":"Data-Juicer: A One-Stop Data Processing System for Large Language Models","publication_year":2024,"publication_date":"2024-05-23","ids":{"openalex":"https://openalex.org/W4398234485","doi":"https://doi.org/10.1145/3626246.3653385"},"language":"en","primary_location":{"id":"doi:10.1145/3626246.3653385","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3626246.3653385","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion of the 2024 International Conference on Management of Data","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5090695897","display_name":"Daoyuan Chen","orcid":"https://orcid.org/0000-0002-8015-2121"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Daoyuan Chen","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022812469","display_name":"Yilun Huang","orcid":"https://orcid.org/0000-0002-1845-5535"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yilun Huang","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039564129","display_name":"Zhijian Ma","orcid":"https://orcid.org/0009-0007-8674-5351"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhijian Ma","raw_affiliation_strings":["Alibaba Group, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Beijing, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068179610","display_name":"Hesen Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hesen Chen","raw_affiliation_strings":["Alibaba Group, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Beijing, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045364186","display_name":"Xuchen Pan","orcid":"https://orcid.org/0009-0002-0081-5405"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xuchen Pan","raw_affiliation_strings":["Alibaba Group, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Beijing, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049173760","display_name":"Ce Ge","orcid":"https://orcid.org/0000-0003-4312-0152"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ce Ge","raw_affiliation_strings":["Alibaba Group, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Beijing, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103243413","display_name":"Dawei Gao","orcid":"https://orcid.org/0009-0007-3882-5189"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dawei Gao","raw_affiliation_strings":["Alibaba Group, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Beijing, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064794276","display_name":"Yuexiang Xie","orcid":"https://orcid.org/0009-0005-6545-7882"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuexiang Xie","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056273654","display_name":"Zhaoyang Liu","orcid":"https://orcid.org/0000-0002-8820-5531"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhaoyang Liu","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000935850","display_name":"Jinyang Gao","orcid":"https://orcid.org/0000-0001-8247-1196"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jinyang Gao","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046576694","display_name":"Yaliang Li","orcid":"https://orcid.org/0000-0002-4204-6096"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yaliang Li","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040297543","display_name":"Bolin Ding","orcid":"https://orcid.org/0000-0003-1535-9692"},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Bolin Ding","raw_affiliation_strings":["Alibaba Group, seattle, USA"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, seattle, USA","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5057864403","display_name":"Jingren Zhou","orcid":"https://orcid.org/0000-0002-4220-2634"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jingren Zhou","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":13,"corresponding_author_ids":["https://openalex.org/A5090695897"],"corresponding_institution_ids":["https://openalex.org/I45928872"],"apc_list":null,"apc_paid":null,"fwci":7.7486,"has_fulltext":false,"cited_by_count":23,"citation_normalized_percentile":{"value":0.9771653,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"120","last_page":"134"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.9898999929428101,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7464851140975952},{"id":"https://openalex.org/keywords/data-modeling","display_name":"Data modeling","score":0.4338364601135254},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.3600367307662964},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.2432047724723816}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7464851140975952},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.4338364601135254},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.3600367307662964},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.2432047724723816}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3626246.3653385","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3626246.3653385","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion of the 2024 International Conference on Management of Data","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":20,"referenced_works":["https://openalex.org/W398859631","https://openalex.org/W1566289585","https://openalex.org/W2012833704","https://openalex.org/W2081193615","https://openalex.org/W2192203593","https://openalex.org/W2556522401","https://openalex.org/W2962739339","https://openalex.org/W2963250244","https://openalex.org/W3006942207","https://openalex.org/W3081168214","https://openalex.org/W3156008561","https://openalex.org/W3156533729","https://openalex.org/W3197876970","https://openalex.org/W3198659451","https://openalex.org/W4205991051","https://openalex.org/W4230804723","https://openalex.org/W4362655426","https://openalex.org/W4385569771","https://openalex.org/W4385572845","https://openalex.org/W4401863753"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052","https://openalex.org/W2382290278","https://openalex.org/W4395014643"],"abstract_inverted_index":{"The":[0],"immense":[1],"evolution":[2],"in":[3,37,89],"Large":[4],"Language":[5],"Models":[6],"(LLMs)":[7],"has":[8],"underscored":[9],"the":[10,57,111,142],"importance":[11],"of":[12,24,59,145],"massive,":[13],"heterogeneous,":[14],"and":[15,66,93,122,154],"high-quality":[16],"data.":[17],"A":[18],"data":[19,25,45,52,62,84,91,113,117,135,157],"recipe":[20],"is":[21,129],"a":[22,34,72],"mixture":[23],"from":[26,63,101],"different":[27,87,156],"sources":[28,114],"for":[29,43,50,115],"training":[30],"LLMs,":[31,60],"which":[32,78],"plays":[33],"vital":[35],"role":[36],"LLMs'":[38,68,139],"performance.":[39,99,140],"Existing":[40],"open-source":[41],"tools":[42],"LLM":[44],"processing":[46],"are":[47,119],"mostly":[48],"tailored":[49],"specific":[51],"recipes.":[53,158],"To":[54],"continuously":[55],"uncover":[56],"potential":[58],"incorporate":[61],"new":[64,73],"sources,":[65],"improve":[67],"performance,":[69],"we":[70,79],"build":[71],"system":[74],"named":[75],"Data-Juicer,":[76,146],"with":[77,124],"can":[80],"efficiently":[81],"generate":[82],"diverse":[83],"recipes,":[85],"explore":[86],"possibilities":[88],"forming":[90,116],"mixtures,":[92],"evaluate":[94,134,155],"their":[95],"effects":[96],"on":[97,138],"model":[98,147],"Different":[100],"traditional":[102],"data-analytics":[103],"pipelines,":[104],"Data-Juicer":[105],"faces":[106],"some":[107],"unique":[108],"challenges.":[109],"Firstly,":[110],"possible":[112],"recipes":[118],"truly":[120],"heterogeneous":[121],"massive":[123],"various":[125],"qualities.":[126],"Secondly,":[127],"it":[128],"extremely":[130],"expensive":[131],"to":[132,152],"precisely":[133],"recipes'":[136],"impact":[137],"Thirdly,":[141],"end":[143],"users":[144],"developers,":[148],"need":[149],"sufficient":[150],"flexibility":[151],"configure":[153]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":17},{"year":2024,"cited_by_count":4}],"updated_date":"2026-04-16T08:26:57.006410","created_date":"2025-10-10T00:00:00"}
