{"id":"https://openalex.org/W4403362766","doi":"https://doi.org/10.48550/arxiv.2408.03617","title":"Is Child-Directed Speech Effective Training Data for Language Models?","display_name":"Is Child-Directed Speech Effective Training Data for Language Models?","publication_year":2024,"publication_date":"2024-08-07","ids":{"openalex":"https://openalex.org/W4403362766","doi":"https://doi.org/10.48550/arxiv.2408.03617"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2408.03617","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.03617","pdf_url":"https://arxiv.org/pdf/2408.03617","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-sa","license_id":"https://openalex.org/licenses/cc-by-nc-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2408.03617","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5039277252","display_name":"Steven Y. Feng","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Feng, Steven Y.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001961716","display_name":"Noah D. Goodman","orcid":"https://orcid.org/0000-0002-9176-8802"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Goodman, Noah D.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5103234894","display_name":"Michael C. Frank","orcid":"https://orcid.org/0000-0002-7551-4378"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Frank, Michael C.","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5039277252"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9860000014305115,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9860000014305115,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9764000177383423,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9732000231742859,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.5929865837097168},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5861053466796875},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.4204692244529724},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.4130095839500427},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.37444326281547546},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.3414493501186371},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.3338688015937805},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.27496790885925293},{"id":"https://openalex.org/keywords/geography","display_name":"Geography","score":0.09046798944473267}],"concepts":[{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.5929865837097168},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5861053466796875},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.4204692244529724},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.4130095839500427},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.37444326281547546},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3414493501186371},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.3338688015937805},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.27496790885925293},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.09046798944473267},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2408.03617","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.03617","pdf_url":"https://arxiv.org/pdf/2408.03617","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-sa","license_id":"https://openalex.org/licenses/cc-by-nc-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2408.03617","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2408.03617","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2408.03617","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.03617","pdf_url":"https://arxiv.org/pdf/2408.03617","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-sa","license_id":"https://openalex.org/licenses/cc-by-nc-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1316833246","display_name":null,"funder_award_id":"PGS D","funder_id":"https://openalex.org/F4320334593","funder_display_name":"Natural Sciences and Engineering Research Council of Canada"},{"id":"https://openalex.org/G7089305364","display_name":null,"funder_award_id":"Doctoral","funder_id":"https://openalex.org/F4320334593","funder_display_name":"Natural Sciences and Engineering Research Council of Canada"}],"funders":[{"id":"https://openalex.org/F4320334593","display_name":"Natural Sciences and Engineering Research Council of Canada","ror":"https://ror.org/01h531d29"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4403362766.pdf","grobid_xml":"https://content.openalex.org/works/W4403362766.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W230091440","https://openalex.org/W2233261550","https://openalex.org/W2810751659","https://openalex.org/W258997015","https://openalex.org/W2997094352","https://openalex.org/W3216976533","https://openalex.org/W100620283","https://openalex.org/W2495260952","https://openalex.org/W4394050964","https://openalex.org/W2551249631"],"abstract_inverted_index":{"While":[0],"high-performing":[1],"language":[2,17,41,138,146,170],"models":[3,53,91],"are":[4,27],"typically":[5],"trained":[6],"on":[7,54],"hundreds":[8],"of":[9,11,24,30,57,76,89,110,124],"billions":[10],"words,":[12],"human":[13],"children":[14],"become":[15],"fluent":[16],"users":[18],"with":[19],"a":[20,62,73],"much":[21],"smaller":[22],"amount":[23],"data.":[25],"What":[26],"the":[28,31,79,84,101,106,125,151,160],"features":[29,39],"data":[32,113,126],"they":[33],"receive,":[34],"and":[35,51,61,72,86],"how":[36],"do":[37,134],"these":[38,90],"support":[40,150],"modeling":[42,171],"objectives?":[43],"To":[44],"investigate":[45],"this":[46],"question,":[47],"we":[48,98],"train":[49],"GPT-2":[50],"RoBERTa":[52],"29M":[55],"words":[56],"English":[58],"child-directed":[59],"speech":[60],"new":[63],"matched,":[64],"synthetic":[65],"dataset":[66],"(TinyDialogues),":[67],"comparing":[68],"to":[69,118],"OpenSubtitles,":[70],"Wikipedia,":[71],"heterogeneous":[74],"blend":[75],"datasets":[77],"from":[78,157],"BabyLM":[80],"challenge.":[81],"We":[82],"evaluate":[83],"syntactic":[85],"semantic":[87],"knowledge":[88],"using":[92],"developmentally-inspired":[93],"evaluations.":[94],"Through":[95],"pretraining":[96],"experiments,":[97],"test":[99],"whether":[100],"global":[102,132],"developmental":[103],"ordering":[104,109],"or":[105],"local":[107,122],"discourse":[108],"children's":[111],"training":[112,145],"supports":[114],"high":[115],"performance":[116],"relative":[117],"other":[119],"datasets.":[120],"The":[121],"properties":[123,133],"affect":[127],"model":[128],"results,":[129],"but":[130],"surprisingly,":[131],"not.":[135],"Further,":[136],"child":[137],"input":[139],"is":[140,164],"not":[141],"uniquely":[142],"valuable":[143],"for":[144],"models.":[147],"These":[148],"findings":[149],"hypothesis":[152],"that,":[153],"rather":[154],"than":[155,168],"proceeding":[156],"better":[158],"data,":[159],"child's":[161],"learning":[162],"algorithm":[163],"substantially":[165],"more":[166],"data-efficient":[167],"current":[169],"techniques.":[172]},"counts_by_year":[],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-10-10T00:00:00"}
