{"id":"https://openalex.org/W4412945542","doi":"https://doi.org/10.18653/v1/2025.acl-long.466","title":"Efficient Pretraining Data Selection for Language Models via Multi-Actor Collaboration","display_name":"Efficient Pretraining Data Selection for Language Models via Multi-Actor Collaboration","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4412945542","doi":"https://doi.org/10.18653/v1/2025.acl-long.466"},"language":"en","primary_location":{"id":"doi:10.18653/v1/2025.acl-long.466","is_oa":false,"landing_page_url":"https://doi.org/10.18653/v1/2025.acl-long.466","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102781208","display_name":"Tianyi Bai","orcid":"https://orcid.org/0009-0009-5057-7100"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tianyi Bai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115592400","display_name":"Ling Yang","orcid":"https://orcid.org/0009-0008-8832-9682"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ling Yang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5093123138","display_name":"Zhen Hao Wong","orcid":"https://orcid.org/0009-0009-1757-5322"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhen Hao Wong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040806847","display_name":"Fupeng Sun","orcid":"https://orcid.org/0000-0003-4574-1817"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fupeng Sun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054159373","display_name":"Xinlin Zhuang","orcid":"https://orcid.org/0009-0006-1822-8224"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xinlin Zhuang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101707851","display_name":"Jiahui Peng","orcid":"https://orcid.org/0000-0002-4936-7626"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiahui Peng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102010202","display_name":"Chi Zhang","orcid":"https://orcid.org/0000-0002-6142-120X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chi Zhang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078745995","display_name":"Lyndia C. Wu","orcid":"https://orcid.org/0000-0002-8236-032X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lijun Wu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111396982","display_name":"Qiu Jiantao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qiu Jiantao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100459878","display_name":"Wentao Zhang","orcid":"https://orcid.org/0000-0003-4549-7498"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wentao Zhang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002684888","display_name":"Binhang Yuan","orcid":"https://orcid.org/0000-0002-3188-2769"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Binhang Yuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5101615091","display_name":"Conghui He","orcid":"https://orcid.org/0000-0001-8697-695X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Conghui He","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":12,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.08892866,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"9465","last_page":"9491"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.9383999705314636,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.9383999705314636,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10538","display_name":"Data Mining Algorithms and Applications","score":0.9293000102043152,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9106000065803528,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7170016169548035},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.7143877744674683},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.455808162689209},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.42594215273857117},{"id":"https://openalex.org/keywords/knowledge-management","display_name":"Knowledge management","score":0.33513662219047546},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.3312135338783264},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.3230859637260437}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7170016169548035},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.7143877744674683},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.455808162689209},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.42594215273857117},{"id":"https://openalex.org/C56739046","wikidata":"https://www.wikidata.org/wiki/Q192060","display_name":"Knowledge management","level":1,"score":0.33513662219047546},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3312135338783264},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3230859637260437}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.18653/v1/2025.acl-long.466","is_oa":false,"landing_page_url":"https://doi.org/10.18653/v1/2025.acl-long.466","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","raw_type":"proceedings-article"},{"id":"pmh:oai:repository.hkust.edu.hk:1783.1-148899","is_oa":false,"landing_page_url":"http://repository.hkust.edu.hk/ir/Record/1783.1-148899","pdf_url":null,"source":{"id":"https://openalex.org/S4306401796","display_name":"Rare & Special e-Zone (The Hong Kong University of Science and Technology)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I200769079","host_organization_name":"Hong Kong University of Science and Technology","host_organization_lineage":["https://openalex.org/I200769079"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Conference paper"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4205762803","https://openalex.org/W2535856026","https://openalex.org/W2265065644","https://openalex.org/W2134699697","https://openalex.org/W3017188156","https://openalex.org/W2322875716","https://openalex.org/W2383516975","https://openalex.org/W2374878784","https://openalex.org/W2147679489","https://openalex.org/W2371642785"],"abstract_inverted_index":{"Efficient":[0],"data":[1,21,36,50,54,59,82,130],"selection":[2,37,51,55],"is":[3,87],"crucial":[4],"to":[5,19,33,89,116,145,153],"accelerate":[6],"the":[7,27,70,74,91,107,154],"pretraining":[8,109],"of":[9,73,93],"language":[10,149],"model":[11,150],"(LMs).":[12],"While":[13],"various":[14,97],"methods":[15],"have":[16],"been":[17],"proposed":[18],"enhance":[20],"efficiency,":[22,131],"limited":[23],"research":[24],"has":[25],"addressed":[26],"inherent":[28],"conflicts":[29],"between":[30],"these":[31],"approaches":[32],"achieve":[34],"optimal":[35],"for":[38,81],"LM":[39,108,135],"pretraining.":[40],"To":[41],"tackle":[42],"this":[43],"problem,":[44],"we":[45],"propose":[46],"a":[47,85],"multi-actor":[48,119],"collaborative":[49],"mechanism:":[52],"each":[53],"method":[56],"independently":[57],"prioritizes":[58],"based":[60],"on":[61],"its":[62,66],"criterion":[63],"and":[64,84,99,137,158],"updates":[65],"prioritization":[67],"rules":[68],"using":[69],"current":[71],"state":[72],"model,":[75],"functioning":[76],"as":[77],"an":[78,139],"independent":[79],"actor":[80],"selection;":[83],"console":[86],"designed":[88],"adjust":[90],"impacts":[92],"different":[94],"actors":[95,105],"at":[96,163],"stages":[98],"dynamically":[100],"integrate":[101],"information":[102],"from":[103],"all":[104],"throughout":[106],"process.":[110],"We":[111],"conduct":[112],"extensive":[113],"empirical":[114],"studies":[115],"evaluate":[117],"our":[118,126],"framework.":[120],"The":[121],"experimental":[122],"results":[123],"demonstrate":[124],"that":[125],"approach":[127],"significantly":[128],"improves":[129],"accelerates":[132],"convergence":[133],"in":[134],"pretraining,":[136],"achieves":[138],"average":[140],"relative":[141],"performance":[142],"gain":[143],"up":[144],"10.5%":[146],"across":[147],"multiple":[148],"benchmarks":[151],"compared":[152],"state-of-the-art":[155],"methods.":[156],"Code":[157],"checkpoints":[159],"are":[160],"publicly":[161],"released":[162],"https://github.com/Relaxed-System-Lab/multi-actor-data-selection.":[164]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
