{"id":"https://openalex.org/W7140223649","doi":"https://doi.org/10.48550/arxiv.2603.21606","title":"mSFT: Addressing Dataset Mixtures Overfitting Heterogeneously in Multi-task SFT","display_name":"mSFT: Addressing Dataset Mixtures Overfitting Heterogeneously in Multi-task SFT","publication_year":2026,"publication_date":"2026-03-23","ids":{"openalex":"https://openalex.org/W7140223649","doi":"https://doi.org/10.48550/arxiv.2603.21606"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.21606","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.21606","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.21606","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Koh, Woosung","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Koh, Woosung","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Jeon, Jeyoung","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jeon, Jeyoung","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Song, Youngjin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Song, Youngjin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Cheon, Yujin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cheon, Yujin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Oh, Soowon","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Oh, Soowon","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Choi, Jaehyeong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Choi, Jaehyeong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Yun, Se-Young","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yun, Se-Young","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.29789999127388,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.29789999127388,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.24879999458789825,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.11559999734163284,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/overfitting","display_name":"Overfitting","score":0.9244999885559082},{"id":"https://openalex.org/keywords/hyperparameter","display_name":"Hyperparameter","score":0.6025000214576721},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5148000121116638},{"id":"https://openalex.org/keywords/homogeneous","display_name":"Homogeneous","score":0.4648999869823456},{"id":"https://openalex.org/keywords/train","display_name":"Train","score":0.4449999928474426},{"id":"https://openalex.org/keywords/heuristics","display_name":"Heuristics","score":0.4196999967098236}],"concepts":[{"id":"https://openalex.org/C22019652","wikidata":"https://www.wikidata.org/wiki/Q331309","display_name":"Overfitting","level":3,"score":0.9244999885559082},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7164999842643738},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.621399998664856},{"id":"https://openalex.org/C8642999","wikidata":"https://www.wikidata.org/wiki/Q4171168","display_name":"Hyperparameter","level":2,"score":0.6025000214576721},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5533999800682068},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5148000121116638},{"id":"https://openalex.org/C66882249","wikidata":"https://www.wikidata.org/wiki/Q169336","display_name":"Homogeneous","level":2,"score":0.4648999869823456},{"id":"https://openalex.org/C190839683","wikidata":"https://www.wikidata.org/wiki/Q2448197","display_name":"Train","level":2,"score":0.4449999928474426},{"id":"https://openalex.org/C127705205","wikidata":"https://www.wikidata.org/wiki/Q5748245","display_name":"Heuristics","level":2,"score":0.4196999967098236},{"id":"https://openalex.org/C42058472","wikidata":"https://www.wikidata.org/wiki/Q810214","display_name":"Base (topology)","level":2,"score":0.4131999909877777},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.40130001306533813},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.36469998955726624},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.30070000886917114},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2800000011920929},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.275299996137619},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.26249998807907104}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.21606","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.21606","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.21606","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.21606","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Current":[0],"language":[1],"model":[2,55],"training":[3,126],"commonly":[4],"applies":[5],"multi-task":[6,49,136],"Supervised":[7],"Fine-Tuning":[8],"(SFT)":[9],"using":[10],"a":[11,131],"homogeneous":[12],"compute":[13,118],"budget":[14],"across":[15,85,99,144],"all":[16],"sub-datasets.":[17],"This":[18],"approach":[19],"is":[20,106],"fundamentally":[21],"sub-optimal:":[22],"heterogeneous":[23],"learning":[24],"dynamics":[25],"cause":[26],"faster-learning":[27],"tasks":[28],"to":[29,69,108],"overfit":[30],"early":[31],"while":[32,124],"slower":[33],"ones":[34],"remain":[35],"under-fitted.":[36],"To":[37],"address":[38],"this,":[39],"we":[40],"introduce":[41],"mSFT,":[42],"an":[43,57],"iterative,":[44],"overfitting-aware":[45,133],"search":[46],"algorithm":[47,134],"for":[48,135],"data":[50,146],"mixtures.":[51,147],"mSFT":[52,80,95,120,129],"trains":[53],"the":[54,63,140],"on":[56],"active":[58],"mixture,":[59],"identifies":[60],"and":[61,67,88,105],"excludes":[62],"earliest":[64],"overfitting":[65],"sub-dataset,":[66],"reverts":[68],"that":[70,79,138],"specific":[71],"optimal":[72],"checkpoint":[73],"before":[74],"continuing.":[75],"Extensive":[76],"evaluations":[77],"demonstrate":[78],"consistently":[81],"outperforms":[82],"4":[83],"baselines":[84],"10":[86],"benchmarks":[87],"6":[89],"base":[90],"models.":[91],"Further":[92],"analysis":[93],"confirms":[94],"maintains":[96],"robust":[97],"gains":[98],"diverse":[100,145],"dataset":[101],"sizes,":[102],"task":[103],"granularities,":[104],"insensitive":[107],"its":[109],"single":[110],"new":[111],"hyperparameter":[112],"(compute":[113],"budget).":[114],"Notably,":[115],"at":[116],"low":[117],"budget,":[119],"can":[121],"improve":[122],"performance":[123],"lowering":[125],"FLOPs.":[127],"Ultimately,":[128],"establishes":[130],"practical":[132],"SFT":[137],"maximizes":[139],"potential":[141],"of":[142],"models":[143]},"counts_by_year":[],"updated_date":"2026-03-26T06:05:38.182114","created_date":"2026-03-25T00:00:00"}
