{"id":"https://openalex.org/W4402705130","doi":"https://doi.org/10.48550/arxiv.2408.15079","title":"BaichuanSEED: Sharing the Potential of ExtensivE Data Collection and Deduplication by Introducing a Competitive Large Language Model Baseline","display_name":"BaichuanSEED: Sharing the Potential of ExtensivE Data Collection and Deduplication by Introducing a Competitive Large Language Model Baseline","publication_year":2024,"publication_date":"2024-08-27","ids":{"openalex":"https://openalex.org/W4402705130","doi":"https://doi.org/10.48550/arxiv.2408.15079"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2408.15079","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.15079","pdf_url":"https://arxiv.org/pdf/2408.15079","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2408.15079","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5112223588","display_name":"Guosheng Dong","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Dong, Guosheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004427460","display_name":"Da Pan","orcid":"https://orcid.org/0000-0002-1618-7389"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pan, Da","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108232154","display_name":"Yiding Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Yiding","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089071765","display_name":"Shusen Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Shusen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Liang, Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liang, Zheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107559435","display_name":"Xin Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Xin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043264044","display_name":"Yanjun Shen","orcid":"https://orcid.org/0000-0001-8109-4819"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Yanjun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058023767","display_name":"Fan Yang","orcid":"https://orcid.org/0000-0002-0365-710X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Fan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111348348","display_name":"Haoze Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Haoze","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054795082","display_name":"Tianpeng Li","orcid":"https://orcid.org/0000-0002-6665-1151"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Tianpeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102576702","display_name":"Mingan Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Mingan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111335403","display_name":"Jianhua Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Jianhua","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110839622","display_name":"Yufan Zhang","orcid":"https://orcid.org/0000-0002-7114-3485"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yufan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111345729","display_name":"Xiaonan Nie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nie, Xiaonan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001046274","display_name":"Lei Su","orcid":"https://orcid.org/0000-0002-7894-7881"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Su, Lei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058596820","display_name":"Bingning Wang","orcid":"https://orcid.org/0000-0003-4095-5082"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Bingning","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111196099","display_name":"Wentao Zhang","orcid":"https://orcid.org/0009-0003-7828-6096"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Wentao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072119199","display_name":"Jiaxin Mao","orcid":"https://orcid.org/0000-0002-9257-5498"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mao, Jiaxin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5086811972","display_name":"Zenan Zhou","orcid":"https://orcid.org/0000-0001-7654-9145"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Zenan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5077228516","display_name":"Weipeng Chen","orcid":"https://orcid.org/0009-0003-8811-2719"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Weipeng","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":20,"corresponding_author_ids":["https://openalex.org/A5112223588"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9160000085830688,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9160000085830688,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/data-deduplication","display_name":"Data deduplication","score":0.8940132856369019},{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.8835998177528381},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.681630551815033},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.39009618759155273},{"id":"https://openalex.org/keywords/geology","display_name":"Geology","score":0.08523207902908325}],"concepts":[{"id":"https://openalex.org/C32587265","wikidata":"https://www.wikidata.org/wiki/Q1182260","display_name":"Data deduplication","level":2,"score":0.8940132856369019},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.8835998177528381},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.681630551815033},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.39009618759155273},{"id":"https://openalex.org/C127313418","wikidata":"https://www.wikidata.org/wiki/Q1069","display_name":"Geology","level":0,"score":0.08523207902908325},{"id":"https://openalex.org/C111368507","wikidata":"https://www.wikidata.org/wiki/Q43518","display_name":"Oceanography","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2408.15079","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.15079","pdf_url":"https://arxiv.org/pdf/2408.15079","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2408.15079","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2408.15079","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2408.15079","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.15079","pdf_url":"https://arxiv.org/pdf/2408.15079","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4402705130.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W3144870715","https://openalex.org/W3142319788","https://openalex.org/W2587188779","https://openalex.org/W3132870970","https://openalex.org/W2943088381","https://openalex.org/W4385804830","https://openalex.org/W2144348063","https://openalex.org/W2074021203"],"abstract_inverted_index":{"The":[0],"general":[1],"capabilities":[2],"of":[3,34,59,138],"Large":[4],"Language":[5],"Models":[6],"(LLM)":[7],"highly":[8],"rely":[9],"on":[10,15,110],"the":[11,32,54,133],"composition":[12],"and":[13,41,45,65,102,106,123,144],"selection":[14],"extensive":[16],"pretraining":[17],"datasets,":[18],"treated":[19],"as":[20,121,142],"commercial":[21,115],"secrets":[22],"by":[23,47,81,91],"several":[24,114,128],"institutions.":[25],"To":[26],"mitigate":[27],"this":[28],"issue,":[29],"we":[30],"open-source":[31],"details":[33],"a":[35,49,73],"universally":[36],"applicable":[37],"data":[38,55],"processing":[39,56],"pipeline":[40,57,83],"validate":[42],"its":[43],"effectiveness":[44],"potential":[46,134],"introducing":[48],"competitive":[50],"LLM":[51],"baseline.":[52],"Specifically,":[53],"consists":[58],"broad":[60],"collection":[61],"to":[62,67,131],"scale":[63],"up":[64],"reweighting":[66],"improve":[68],"quality.":[69],"We":[70,125],"then":[71],"pretrain":[72],"7B":[74],"model":[75],"BaichuanSEED":[76,99],"with":[77,113],"3T":[78],"tokens":[79],"processed":[80],"our":[82],"without":[84],"any":[85],"deliberate":[86],"downstream":[87,139],"task-related":[88],"optimization,":[89],"followed":[90],"an":[92],"easy":[93],"but":[94],"effective":[95],"supervised":[96],"fine-tuning":[97],"stage.":[98],"demonstrates":[100],"consistency":[101],"predictability":[103],"throughout":[104],"training":[105],"achieves":[107],"comparable":[108],"performance":[109],"comprehensive":[111],"benchmarks":[112],"advanced":[116],"large":[117],"language":[118],"models,":[119],"such":[120,141],"Qwen1.5":[122],"Llama3.":[124],"also":[126],"conduct":[127],"heuristic":[129],"experiments":[130],"discuss":[132],"for":[135],"further":[136],"optimization":[137],"tasks,":[140],"mathematics":[143],"coding.":[145]},"counts_by_year":[],"updated_date":"2026-04-16T08:26:57.006410","created_date":"2024-09-21T00:00:00"}
