{"id":"https://openalex.org/W7160999805","doi":"https://doi.org/10.48550/arxiv.2605.11231","title":"LiBaGS: Lightweight Boundary Gap Synthesis for Targeted Synthetic Data Selection","display_name":"LiBaGS: Lightweight Boundary Gap Synthesis for Targeted Synthetic Data Selection","publication_year":2026,"publication_date":"2026-05-11","ids":{"openalex":"https://openalex.org/W7160999805","doi":"https://doi.org/10.48550/arxiv.2605.11231"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.11231","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.11231","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.11231","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5015875550","display_name":"Abhishek Moturu","orcid":"https://orcid.org/0000-0003-0332-6324"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Moturu, Abhishek","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087983309","display_name":"Anna Goldenberg","orcid":"https://orcid.org/0000-0002-2416-833X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Goldenberg, Anna","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5011257199","display_name":"Babak Taati","orcid":"https://orcid.org/0000-0001-9763-4293"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Taati, Babak","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.6115000247955322,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.6115000247955322,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11652","display_name":"Imbalanced Data Classification Techniques","score":0.0812000036239624,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12072","display_name":"Machine Learning and Algorithms","score":0.05719999969005585,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/synthetic-data","display_name":"Synthetic data","score":0.79339998960495},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.6765000224113464},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.47380000352859497},{"id":"https://openalex.org/keywords/missing-data","display_name":"Missing data","score":0.43689998984336853},{"id":"https://openalex.org/keywords/boundary","display_name":"Boundary (topology)","score":0.382099986076355},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3449000120162964},{"id":"https://openalex.org/keywords/real-world-data","display_name":"Real world data","score":0.34040001034736633}],"concepts":[{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.79339998960495},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.6765000224113464},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6195999979972839},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.47450000047683716},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.47380000352859497},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.45840001106262207},{"id":"https://openalex.org/C9357733","wikidata":"https://www.wikidata.org/wiki/Q6878417","display_name":"Missing data","level":2,"score":0.43689998984336853},{"id":"https://openalex.org/C62354387","wikidata":"https://www.wikidata.org/wiki/Q875399","display_name":"Boundary (topology)","level":2,"score":0.382099986076355},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3553999960422516},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3449000120162964},{"id":"https://openalex.org/C3020493868","wikidata":"https://www.wikidata.org/wiki/Q55631277","display_name":"Real world data","level":2,"score":0.34040001034736633},{"id":"https://openalex.org/C2778348673","wikidata":"https://www.wikidata.org/wiki/Q739302","display_name":"Production (economics)","level":2,"score":0.31119999289512634},{"id":"https://openalex.org/C110121322","wikidata":"https://www.wikidata.org/wiki/Q865811","display_name":"Distribution (mathematics)","level":2,"score":0.30410000681877136},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.30090001225471497},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2919999957084656},{"id":"https://openalex.org/C2986800882","wikidata":"https://www.wikidata.org/wiki/Q7168187","display_name":"Performance enhancement","level":2,"score":0.2736000120639801},{"id":"https://openalex.org/C149441793","wikidata":"https://www.wikidata.org/wiki/Q200726","display_name":"Probability distribution","level":2,"score":0.26759999990463257},{"id":"https://openalex.org/C93959086","wikidata":"https://www.wikidata.org/wiki/Q6888345","display_name":"Model selection","level":2,"score":0.2630000114440918},{"id":"https://openalex.org/C138827492","wikidata":"https://www.wikidata.org/wiki/Q6661985","display_name":"Data processing","level":2,"score":0.25760000944137573},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.25220000743865967}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.11231","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.11231","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.11231","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.11231","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Synthetic":[0],"data":[1,33,65,86],"is":[2],"useful":[3],"only":[4,89],"when":[5,97],"the":[6,13,19,63,90],"added":[7,103],"samples":[8,39,54,100],"fill":[9],"missing":[10],"parts":[11],"of":[12],"training":[14,32],"distribution":[15],"that":[16,52,74,127],"matter":[17],"for":[18,29],"downstream":[20],"task.":[21],"We":[22,67],"introduce":[23],"LiBaGS,":[24],"a":[25,70,105,117],"lightweight,":[26],"generator-agnostic":[27],"method":[28],"targeted":[30],"synthetic":[31,38,99],"selection.":[34],"LiBaGS":[35,94,128],"scores":[36],"candidate":[37],"by":[40],"combining":[41],"decision-boundary":[42,79],"proximity,":[43],"predictive":[44],"uncertainty,":[45],"real-data":[46],"density,":[47],"and":[48,58,115,137,140],"support":[49],"validity,":[50],"so":[51],"selected":[53],"are":[55],"both":[56],"informative":[57],"likely":[59],"to":[60,120],"remain":[61],"on":[62],"real":[64],"manifold.":[66],"then":[68],"use":[69],"boundary-gap":[71],"allocation":[72],"rule":[73],"targets":[75],"sparse":[76],"but":[77],"realistic":[78],"neighborhoods,":[80],"rather":[81],"than":[82],"simply":[83],"adding":[84],"more":[85],"or":[87],"selecting":[88],"most":[91],"uncertain":[92],"candidates.":[93],"also":[95],"learns":[96],"enough":[98],"have":[101],"been":[102],"through":[104],"marginal-value":[106],"stopping":[107],"rule,":[108],"assigns":[109],"softer":[110],"labels":[111],"near":[112],"ambiguous":[113],"boundaries,":[114],"uses":[116],"diversity":[118],"objective":[119],"avoid":[121],"redundant":[122],"near-duplicate":[123],"selections.":[124],"Experiments":[125],"show":[126],"improves":[129],"accuracy":[130],"over":[131],"classical":[132],"oversampling,":[133],"hard":[134],"augmentation,":[135],"uncertainty":[136],"density":[138],"ablations,":[139],"targeted-generation":[141],"selection":[142],"criteria.":[143]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-14T00:00:00"}
