{"id":"https://openalex.org/W4400525623","doi":"https://doi.org/10.1145/3626772.3657671","title":"An Integrated Data Processing Framework for Pretraining Foundation Models","display_name":"An Integrated Data Processing Framework for Pretraining Foundation Models","publication_year":2024,"publication_date":"2024-07-10","ids":{"openalex":"https://openalex.org/W4400525623","doi":"https://doi.org/10.1145/3626772.3657671"},"language":"en","primary_location":{"id":"doi:10.1145/3626772.3657671","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3626772.3657671","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5103326225","display_name":"Yiding Sun","orcid":"https://orcid.org/0009-0004-1671-5016"},"institutions":[{"id":"https://openalex.org/I78988378","display_name":"Renmin University of China","ror":"https://ror.org/041pakw92","country_code":"CN","type":"education","lineage":["https://openalex.org/I78988378"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yiding Sun","raw_affiliation_strings":["Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0004-1671-5016","affiliations":[{"raw_affiliation_string":"Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China","institution_ids":["https://openalex.org/I78988378"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103326343","display_name":"Feng Wang","orcid":"https://orcid.org/0009-0006-5146-1562"},"institutions":[{"id":"https://openalex.org/I78988378","display_name":"Renmin University of China","ror":"https://ror.org/041pakw92","country_code":"CN","type":"education","lineage":["https://openalex.org/I78988378"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Feng Wang","raw_affiliation_strings":["Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0006-5146-1562","affiliations":[{"raw_affiliation_string":"Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China","institution_ids":["https://openalex.org/I78988378"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007186741","display_name":"Yutao Zhu","orcid":"https://orcid.org/0000-0002-9432-3251"},"institutions":[{"id":"https://openalex.org/I78988378","display_name":"Renmin University of China","ror":"https://ror.org/041pakw92","country_code":"CN","type":"education","lineage":["https://openalex.org/I78988378"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yutao Zhu","raw_affiliation_strings":["Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-9432-3251","affiliations":[{"raw_affiliation_string":"Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China","institution_ids":["https://openalex.org/I78988378"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037145565","display_name":"Wayne Xin Zhao","orcid":"https://orcid.org/0000-0002-8333-6196"},"institutions":[{"id":"https://openalex.org/I78988378","display_name":"Renmin University of China","ror":"https://ror.org/041pakw92","country_code":"CN","type":"education","lineage":["https://openalex.org/I78988378"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wayne Xin Zhao","raw_affiliation_strings":["Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-8333-6196","affiliations":[{"raw_affiliation_string":"Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China","institution_ids":["https://openalex.org/I78988378"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5072119199","display_name":"Jiaxin Mao","orcid":"https://orcid.org/0000-0002-9257-5498"},"institutions":[{"id":"https://openalex.org/I78988378","display_name":"Renmin University of China","ror":"https://ror.org/041pakw92","country_code":"CN","type":"education","lineage":["https://openalex.org/I78988378"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiaxin Mao","raw_affiliation_strings":["Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-9257-5498","affiliations":[{"raw_affiliation_string":"Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China","institution_ids":["https://openalex.org/I78988378"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5103326225"],"corresponding_institution_ids":["https://openalex.org/I78988378"],"apc_list":null,"apc_paid":null,"fwci":0.7142,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.70137939,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"2713","last_page":"2718"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10191","display_name":"Robotics and Sensor-Based Localization","score":0.9908000230789185,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9871000051498413,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/foundation","display_name":"Foundation (evidence)","score":0.8181388974189758},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6773345470428467}],"concepts":[{"id":"https://openalex.org/C2780966255","wikidata":"https://www.wikidata.org/wiki/Q5474306","display_name":"Foundation (evidence)","level":2,"score":0.8181388974189758},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6773345470428467},{"id":"https://openalex.org/C166957645","wikidata":"https://www.wikidata.org/wiki/Q23498","display_name":"Archaeology","level":1,"score":0.0},{"id":"https://openalex.org/C95457728","wikidata":"https://www.wikidata.org/wiki/Q309","display_name":"History","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3626772.3657671","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3626772.3657671","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":27,"referenced_works":["https://openalex.org/W2147717514","https://openalex.org/W2611669587","https://openalex.org/W2747329762","https://openalex.org/W2963015836","https://openalex.org/W2981852735","https://openalex.org/W3169113923","https://openalex.org/W3177765786","https://openalex.org/W3212368439","https://openalex.org/W4206410067","https://openalex.org/W4224308101","https://openalex.org/W4225591000","https://openalex.org/W4288076474","https://openalex.org/W4288089799","https://openalex.org/W4308760226","https://openalex.org/W4311642023","https://openalex.org/W4322718191","https://openalex.org/W4377121615","https://openalex.org/W4379251438","https://openalex.org/W4384918448","https://openalex.org/W4385889719","https://openalex.org/W4386184788","https://openalex.org/W4386556635","https://openalex.org/W4386557438","https://openalex.org/W4386655575","https://openalex.org/W4388927919","https://openalex.org/W6810081322","https://openalex.org/W7093349750"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2381393187","https://openalex.org/W2332779545","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W2358060160","https://openalex.org/W2035483685","https://openalex.org/W4396701345","https://openalex.org/W2376932109"],"abstract_inverted_index":{"The":[0,94,146],"ability":[1],"of":[2,72,75,90],"the":[3,91,128,143],"foundation":[4],"models":[5],"heavily":[6],"relies":[7],"on":[8,153],"large-scale,":[9],"diverse,":[10],"and":[11,22,33,53,81,88,101,121,137,148],"high-quality":[12],"pretraining":[13,142],"data.":[14,93],"In":[15,104],"order":[16],"to":[17,26,99,112],"improve":[18],"data":[19,36,41,46,62,129],"quality,":[20],"researchers":[21],"practitioners":[23],"often":[24],"have":[25],"manually":[27],"curate":[28],"datasets":[29],"from":[30],"difference":[31],"sources":[32],"develop":[34],"dedicated":[35],"cleansing":[37],"pipeline":[38],"for":[39],"each":[40],"repository.":[42],"Lacking":[43],"a":[44,61,67,73],"unified":[45],"processing":[47,63],"framework,":[48],"this":[49,57,105,114],"process":[50],"is":[51,97],"repetitive":[52],"cumbersome.":[54],"To":[55],"mitigate":[56],"issue,":[58],"we":[59,108],"propose":[60],"framework":[64,96,115],"that":[65],"integrates":[66],"Processing":[68],"Module":[69,84],"which":[70,85],"consists":[71],"series":[74],"operators":[76],"at":[77],"different":[78],"granularity":[79],"levels,":[80],"an":[82,132,138],"Analyzing":[83],"supports":[86],"probing":[87],"evaluation":[89,134,140],"refined":[92],"proposed":[95],"easy":[98],"use":[100,113,119],"highly":[102],"flexible.":[103],"demo":[106],"paper,":[107],"first":[109],"introduce":[110],"how":[111],"with":[116,131,135],"some":[117],"example":[118],"cases":[120],"then":[122],"demonstrate":[123],"its":[124],"effectiveness":[125],"in":[126,141],"improving":[127],"quality":[130],"automated":[133],"ChatGPT":[136],"end-to-end":[139],"GPT-2":[144],"model.":[145],"code":[147],"demonstration":[149],"video":[150],"are":[151],"accessible":[152],"GitHub.":[154]},"counts_by_year":[{"year":2025,"cited_by_count":3}],"updated_date":"2025-12-21T01:58:51.020947","created_date":"2025-10-10T00:00:00"}
