{"id":"https://openalex.org/W4405089011","doi":"https://doi.org/10.48550/arxiv.2412.02980","title":"Surveying the Effects of Quality, Diversity, and Complexity in Synthetic Data From Large Language Models","display_name":"Surveying the Effects of Quality, Diversity, and Complexity in Synthetic Data From Large Language Models","publication_year":2024,"publication_date":"2024-12-04","ids":{"openalex":"https://openalex.org/W4405089011","doi":"https://doi.org/10.48550/arxiv.2412.02980"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2412.02980","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.02980","pdf_url":"https://arxiv.org/pdf/2412.02980","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2412.02980","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5072189170","display_name":"Alex Havrilla","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Havrilla, Alex","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101597225","display_name":"Andrew M. Dai","orcid":"https://orcid.org/0009-0007-9200-8577"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dai, Andrew","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5086509763","display_name":"Laura O'Mahony","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"O'Mahony, Laura","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058203980","display_name":"Koen Oostermeijer","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Oostermeijer, Koen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115021344","display_name":"Vera Zisler","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zisler, Vera","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090026363","display_name":"Alon Albalak","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Albalak, Alon","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115021345","display_name":"Fabrizio Milo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Milo, Fabrizio","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040523178","display_name":"Sharath Chandra Raparthy","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Raparthy, Sharath Chandra","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085625870","display_name":"Kanishk Gandhi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gandhi, Kanishk","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5098824372","display_name":"Baber Abbasi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Abbasi, Baber","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003505678","display_name":"Duy Phung","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Phung, Duy","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021345936","display_name":"Maia Iyer","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Iyer, Maia","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5094363682","display_name":"Dakota Mahan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mahan, Dakota","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Blagden, Chase","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Blagden, Chase","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115021343","display_name":"Srishti Gureja","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gureja, Srishti","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053361626","display_name":"Mohammed Hamdy","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hamdy, Mohammed","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101583301","display_name":"Weizhong Li","orcid":"https://orcid.org/0000-0002-9003-7733"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Wen-Ding","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057872569","display_name":"Giovanni Paolini","orcid":"https://orcid.org/0000-0002-4309-281X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Paolini, Giovanni","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087801436","display_name":"Pawan Sasanka Ammanamanchi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ammanamanchi, Pawan Sasanka","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5020821744","display_name":"Elliot Meyerson","orcid":"https://orcid.org/0000-0002-1871-2757"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Meyerson, Elliot","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":20,"corresponding_author_ids":["https://openalex.org/A5072189170"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":4,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9625999927520752,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9625999927520752,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9593999981880188,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/diversity","display_name":"Diversity (politics)","score":0.7165104150772095},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.5465096235275269},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.47348034381866455},{"id":"https://openalex.org/keywords/data-quality","display_name":"Data quality","score":0.4463016986846924},{"id":"https://openalex.org/keywords/econometrics","display_name":"Econometrics","score":0.3452904522418976},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.3305957317352295},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.25982141494750977},{"id":"https://openalex.org/keywords/political-science","display_name":"Political science","score":0.1749083399772644},{"id":"https://openalex.org/keywords/economics","display_name":"Economics","score":0.1478641927242279},{"id":"https://openalex.org/keywords/epistemology","display_name":"Epistemology","score":0.11177653074264526},{"id":"https://openalex.org/keywords/philosophy","display_name":"Philosophy","score":0.08294188976287842},{"id":"https://openalex.org/keywords/operations-management","display_name":"Operations management","score":0.07015484571456909}],"concepts":[{"id":"https://openalex.org/C2781316041","wikidata":"https://www.wikidata.org/wiki/Q1230584","display_name":"Diversity (politics)","level":2,"score":0.7165104150772095},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.5465096235275269},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.47348034381866455},{"id":"https://openalex.org/C24756922","wikidata":"https://www.wikidata.org/wiki/Q1757694","display_name":"Data quality","level":3,"score":0.4463016986846924},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.3452904522418976},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3305957317352295},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.25982141494750977},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.1749083399772644},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.1478641927242279},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.11177653074264526},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.08294188976287842},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.07015484571456909},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2412.02980","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.02980","pdf_url":"https://arxiv.org/pdf/2412.02980","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2412.02980","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2412.02980","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2412.02980","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.02980","pdf_url":"https://arxiv.org/pdf/2412.02980","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4405089011.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4236583520","https://openalex.org/W2322261865","https://openalex.org/W4398861705","https://openalex.org/W2734736160","https://openalex.org/W2409468626","https://openalex.org/W2137479650","https://openalex.org/W2245479382","https://openalex.org/W3139296374","https://openalex.org/W4387803644","https://openalex.org/W4210350690"],"abstract_inverted_index":{"Synthetic":[0],"data":[1,14,29,57,65,126,145,149,160,173,190],"generation":[2,30,161],"with":[3],"Large":[4],"Language":[5],"Models":[6],"is":[7,253],"a":[8,16,180,264],"promising":[9],"paradigm":[10],"for":[11,75,98,106,114,191,235,245],"augmenting":[12],"natural":[13],"over":[15],"nearly":[17],"infinite":[18],"range":[19],"of":[20,55,64,89,121,139,185,221,258,266],"tasks.":[21],"Given":[22],"this":[23,271],"variety,":[24],"direct":[25],"comparisons":[26],"among":[27],"synthetic":[28,56,144,159,189,222],"algorithms":[31,51,162,261],"are":[32,229],"scarce,":[33],"making":[34,268],"it":[35],"difficult":[36],"to":[37,49,95,103,111,155,199,255],"understand":[38],"where":[39],"improvement":[40],"comes":[41],"from":[42],"and":[43,68,81,109,127,157,168,195,214,232,242,262],"what":[44],"bottlenecks":[45],"exist.":[46],"We":[47,70,92,134,224,247],"propose":[48],"evaluate":[50],"via":[52],"the":[53,82,87,119,128,137,143,164,169,183,200,219,243,256],"makeup":[54],"generated":[58],"by":[59],"each":[60,84,148],"algorithm":[61],"in":[62,78,124,142,188,203,270],"terms":[63],"quality,":[66,237],"diversity,":[67],"complexity.":[69],"choose":[71],"these":[72,251],"three":[73],"characteristics":[74],"their":[76],"significance":[77],"open-ended":[79],"processes":[80],"impact":[83,218],"has":[85],"on":[86,131,147,172,182],"capabilities":[88],"downstream":[90,129],"models.":[91],"find":[93],"quality":[94,213],"be":[96,104,112],"essential":[97,105,254],"in-distribution":[99],"model":[100,132,211],"generalization,":[101,108],"diversity":[102,216,241],"out-of-distribution":[107],"complexity":[110],"beneficial":[113],"both.":[115],"Further,":[116],"we":[117],"emphasize":[118],"existence":[120],"Quality-Diversity":[122],"trade-offs":[123,202,209,252],"training":[125,204],"effects":[130,171],"performance.":[133],"then":[135],"examine":[136],"effect":[138],"various":[140],"components":[141,165],"pipeline":[146],"characteristic.":[150],"This":[151,176],"examination":[152],"allows":[153],"us":[154],"taxonomize":[156],"compare":[158],"through":[163],"they":[166],"utilize":[167],"resulting":[170],"QDC":[174,187],"composition.":[175],"analysis":[177],"extends":[178],"into":[179],"discussion":[181],"importance":[184],"balancing":[186,250],"efficient":[192],"reinforcement":[193],"learning":[194],"self-improvement":[196,260],"algorithms.":[197],"Analogous":[198],"QD":[201],"data,":[205],"often":[206],"there":[207],"exist":[208],"between":[210],"output":[212,215,236,240],"which":[217],"composition":[220],"data.":[223],"observe":[225],"that":[226,249],"many":[227],"models":[228],"currently":[230],"evaluated":[231],"optimized":[233],"only":[234],"thereby":[238],"limiting":[239],"potential":[244],"self-improvement.":[246],"argue":[248],"development":[257],"future":[259],"highlight":[263],"number":[265],"works":[267],"progress":[269],"direction.":[272]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":3}],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2024-12-06T00:00:00"}
