{"id":"https://openalex.org/W4416036340","doi":"https://doi.org/10.18653/v1/2025.emnlp-main.914","title":"Language Models as Continuous Self-Evolving Data Engineers","display_name":"Language Models as Continuous Self-Evolving Data Engineers","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4416036340","doi":"https://doi.org/10.18653/v1/2025.emnlp-main.914"},"language":null,"primary_location":{"id":"doi:10.18653/v1/2025.emnlp-main.914","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.914","pdf_url":"https://aclanthology.org/2025.emnlp-main.914.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.emnlp-main.914.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5062250845","display_name":"Peidong Wang","orcid":"https://orcid.org/0000-0002-7042-0209"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Peidong Wang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100432573","display_name":"Ming Wang","orcid":"https://orcid.org/0000-0002-6641-3700"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ming Wang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104108916","display_name":"Zhiming Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhiming Ma","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101508284","display_name":"Xiaocui Yang","orcid":"https://orcid.org/0000-0002-9488-6716"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiaocui Yang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068901864","display_name":"Feng Shi","orcid":"https://orcid.org/0000-0001-5369-6695"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shi Feng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035378456","display_name":"Daling Wang","orcid":"https://orcid.org/0000-0003-1340-0778"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Daling Wang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107340701","display_name":"Yifei Zhang","orcid":"https://orcid.org/0000-0002-1950-137X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yifei Zhang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5078512486","display_name":"Kaisong Song","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kaisong Song","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5062250845"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.17263153,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"18108","last_page":"18127"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.2125999927520752,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.2125999927520752,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12090","display_name":"Language and cultural evolution","score":0.10830000042915344,"subfield":{"id":"https://openalex.org/subfields/3316","display_name":"Cultural Studies"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.08799999952316284,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/modeling-language","display_name":"Modeling language","score":0.3287999927997589},{"id":"https://openalex.org/keywords/data-modeling","display_name":"Data modeling","score":0.3273000121116638},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.3068999946117401},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.2883000075817108},{"id":"https://openalex.org/keywords/data-collection","display_name":"Data collection","score":0.27459999918937683}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.649399995803833},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.35030001401901245},{"id":"https://openalex.org/C179603123","wikidata":"https://www.wikidata.org/wiki/Q1941921","display_name":"Modeling language","level":3,"score":0.3287999927997589},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.3273000121116638},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.325300008058548},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.3068999946117401},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2883000075817108},{"id":"https://openalex.org/C133462117","wikidata":"https://www.wikidata.org/wiki/Q4929239","display_name":"Data collection","level":2,"score":0.27459999918937683},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.27410000562667847},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.26100000739097595},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.2590999901294708}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2025.emnlp-main.914","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.914","pdf_url":"https://aclanthology.org/2025.emnlp-main.914.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.emnlp-main.914","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.914","pdf_url":"https://aclanthology.org/2025.emnlp-main.914.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G2087396116","display_name":null,"funder_award_id":"China","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3317480652","display_name":null,"funder_award_id":"Science","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3523622274","display_name":null,"funder_award_id":"62172086","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5994120800","display_name":null,"funder_award_id":"Natural","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G6058138561","display_name":null,"funder_award_id":", No.","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7185304476","display_name":null,"funder_award_id":"62106039","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G8666265765","display_name":null,"funder_award_id":"62272092","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4416036340.pdf","grobid_xml":"https://content.openalex.org/works/W4416036340.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"Language":[1],"Models":[2],"(LLMs)":[3],"have":[4],"demonstrated":[5],"remarkable":[6],"capabilities,":[7],"yet":[8],"their":[9],"further":[10],"evolution":[11],"is":[12,40,51],"often":[13],"hampered":[14],"by":[15,76],"the":[16,23,100,116],"scarcity":[17],"of":[18,26,104,118,146],"high-quality":[19,128],"training":[20],"data":[21,45,68,83,96,106,129,155,171],"and":[22,39,81,102,131,150],"heavy":[24],"reliance":[25,32,161],"traditional":[27],"methods":[28],"on":[29,36,110,162],"expert-labeled":[30],"data.This":[31],"sets":[33],"a":[34,58,177],"ceiling":[35],"LLM":[37,181],"performance":[38],"particularly":[41],"challenging":[42],"in":[43,141,184],"low":[44],"resource":[46],"scenarios":[47],"where":[48],"extensive":[49],"supervision":[50],"unavailable.To":[52],"address":[53],"this":[54],"issue,":[55],"we":[56,114],"propose":[57],"novel":[59],"paradigm":[60,157],"named":[61],"LANCE":[62,119,139],"(LANguage":[63],"models":[64,167],"as":[65,93],"Continuous":[66],"self-Evolving":[67],"engineers)":[69],"that":[70,89,124],"enables":[71],"LLMs":[72,90],"to":[73],"train":[74],"themselves":[75],"autonomously":[77],"generating,":[78],"cleaning,":[79],"reviewing,":[80],"annotating":[82],"with":[84,173,186],"preference":[85],"information.Our":[86],"approach":[87],"demonstrates":[88],"can":[91,126],"serve":[92],"continuous":[94],"self-evolving":[95],"engineers,":[97],"significantly":[98],"reducing":[99],"time":[101],"cost":[103],"post-training":[105],"construction.Through":[107],"iterative":[108],"fine-tuning":[109],"Qwen2":[111],"series":[112],"models,":[113],"validate":[115],"effectiveness":[117],"across":[120],"various":[121],"tasks,":[122],"showing":[123],"it":[125],"maintain":[127],"generation":[130],"continuously":[132],"improve":[133],"model":[134],"performance.Across":[135],"multiple":[136],"benchmark":[137],"dimensions,":[138],"results":[140],"an":[142],"average":[143],"score":[144],"enhancement":[145],"3.64":[147],"for":[148,152,180],"Qwen2-7B":[149],"1.75":[151],"Qwen2-7B-Instruct.This":[153],"autonomous":[154],"construction":[156],"not":[158],"only":[159],"lessens":[160],"human":[163,174],"experts":[164],"or":[165],"external":[166],"but":[168],"also":[169],"ensures":[170],"aligns":[172],"preferences,":[175],"offering":[176],"scalable":[178],"path":[179],"self-improvement,":[182],"especially":[183],"contexts":[185],"limited":[187],"supervisory":[188],"data.":[189]},"counts_by_year":[],"updated_date":"2026-04-18T07:56:08.524223","created_date":"2025-11-08T00:00:00"}
