{"id":"https://openalex.org/W7138195712","doi":"https://doi.org/10.1609/aaai.v40i37.40444","title":"Selection of LLM Fine-Tuning Data Based on Orthogonal Rules","display_name":"Selection of LLM Fine-Tuning Data Based on Orthogonal Rules","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7138195712","doi":"https://doi.org/10.1609/aaai.v40i37.40444"},"language":null,"primary_location":{"id":"doi:10.1609/aaai.v40i37.40444","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i37.40444","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://doi.org/10.1609/aaai.v40i37.40444","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129745659","display_name":"Xiaomin Li","orcid":null},"institutions":[{"id":"https://openalex.org/I2801851002","display_name":"Harvard University Press","ror":"https://ror.org/006v7bf86","country_code":"US","type":"other","lineage":["https://openalex.org/I136199984","https://openalex.org/I2801851002"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Xiaomin Li","raw_affiliation_strings":["Harvard University"],"affiliations":[{"raw_affiliation_string":"Harvard University","institution_ids":["https://openalex.org/I2801851002"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129679786","display_name":"Mingye Gao","orcid":null},"institutions":[{"id":"https://openalex.org/I63966007","display_name":"Massachusetts Institute of Technology","ror":"https://ror.org/042nb2s44","country_code":"US","type":"education","lineage":["https://openalex.org/I63966007"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Mingye Gao","raw_affiliation_strings":["Massachusetts Institute of Technology"],"affiliations":[{"raw_affiliation_string":"Massachusetts Institute of Technology","institution_ids":["https://openalex.org/I63966007"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129740440","display_name":"Zhiwei Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I130769515","display_name":"Pennsylvania State University","ror":"https://ror.org/04p491231","country_code":"US","type":"education","lineage":["https://openalex.org/I130769515"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhiwei Zhang","raw_affiliation_strings":["Pennsylvania State University"],"affiliations":[{"raw_affiliation_string":"Pennsylvania State University","institution_ids":["https://openalex.org/I130769515"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109799726","display_name":"Chang Yue","orcid":null},"institutions":[{"id":"https://openalex.org/I20089843","display_name":"Princeton University","ror":"https://ror.org/00hx57361","country_code":"US","type":"education","lineage":["https://openalex.org/I20089843"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Chang Yue","raw_affiliation_strings":["Princeton University"],"affiliations":[{"raw_affiliation_string":"Princeton University","institution_ids":["https://openalex.org/I20089843"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5129661695","display_name":"Hong Hu","orcid":null},"institutions":[{"id":"https://openalex.org/I204465549","display_name":"Washington University in St. Louis","ror":"https://ror.org/01yc7t268","country_code":"US","type":"education","lineage":["https://openalex.org/I204465549"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Hong Hu","raw_affiliation_strings":["Washington University in St. Louis"],"affiliations":[{"raw_affiliation_string":"Washington University in St. Louis","institution_ids":["https://openalex.org/I204465549"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5129745659"],"corresponding_institution_ids":["https://openalex.org/I2801851002"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.41595289,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"37","first_page":"31760","last_page":"31768"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.43380001187324524,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.43380001187324524,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.1623000055551529,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13629","display_name":"Text Readability and Simplification","score":0.0917000025510788,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/orthogonality","display_name":"Orthogonality","score":0.685699999332428},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.6732000112533569},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.6152999997138977},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.59170001745224},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5759000182151794},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.5123999714851379},{"id":"https://openalex.org/keywords/data-set","display_name":"Data set","score":0.4645000100135803},{"id":"https://openalex.org/keywords/downstream","display_name":"Downstream (manufacturing)","score":0.453900009393692}],"concepts":[{"id":"https://openalex.org/C17137986","wikidata":"https://www.wikidata.org/wiki/Q215067","display_name":"Orthogonality","level":2,"score":0.685699999332428},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.6732000112533569},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6474000215530396},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.6152999997138977},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.59170001745224},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5759000182151794},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.5444999933242798},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.5123999714851379},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5109000205993652},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4796000123023987},{"id":"https://openalex.org/C58489278","wikidata":"https://www.wikidata.org/wiki/Q1172284","display_name":"Data set","level":2,"score":0.4645000100135803},{"id":"https://openalex.org/C2776207758","wikidata":"https://www.wikidata.org/wiki/Q5303302","display_name":"Downstream (manufacturing)","level":2,"score":0.453900009393692},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.38679999113082886},{"id":"https://openalex.org/C2780898871","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Performance metric","level":2,"score":0.3813999891281128},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.36039999127388},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.30219998955726624},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.2904999852180481},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.28999999165534973},{"id":"https://openalex.org/C93959086","wikidata":"https://www.wikidata.org/wiki/Q6888345","display_name":"Model selection","level":2,"score":0.2872999906539917},{"id":"https://openalex.org/C3020493868","wikidata":"https://www.wikidata.org/wiki/Q55631277","display_name":"Real world data","level":2,"score":0.27970001101493835},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2703000009059906},{"id":"https://openalex.org/C2778334786","wikidata":"https://www.wikidata.org/wiki/Q1586270","display_name":"Variation (astronomy)","level":2,"score":0.26260000467300415},{"id":"https://openalex.org/C5274069","wikidata":"https://www.wikidata.org/wiki/Q2285707","display_name":"Categorical variable","level":2,"score":0.2597000002861023}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v40i37.40444","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i37.40444","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i37.40444","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i37.40444","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[{"score":0.7317124605178833,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"High-quality":[0],"training":[1],"data":[2,23,58,93],"is":[3],"critical":[4],"to":[5,19,50,73,85,99,109,120],"the":[6,67,104,111,122,157],"performance":[7,152,182],"of":[8,29,69,92,153],"large":[9],"language":[10],"models":[11],"(LLMs).":[12],"Recent":[13],"work":[14],"has":[15],"explored":[16],"using":[17],"LLMs":[18,84,154],"rate":[20],"and":[21,47,75,102,125,150,165,179],"select":[22,76,110],"based":[24,65],"on":[25,39,66,156],"a":[26,55,63],"small":[27],"set":[28],"human-designed":[30],"criteria":[31],"(rules),":[32],"but":[33],"these":[34,100],"approaches":[35],"often":[36],"rely":[37],"heavily":[38],"heuristics,":[40],"lack":[41],"principled":[42],"metrics":[43],"for":[44,130],"rule":[45,70,172],"evaluation,":[46],"generalize":[48],"poorly":[49],"new":[51],"tasks.":[52],"We":[53,137],"propose":[54],"novel":[56],"rule-based":[57],"selection":[59,173],"framework":[60,140],"that":[61,169],"introduces":[62],"metric":[64],"orthogonality":[68],"score":[71,121],"vectors":[72],"evaluate":[74,138],"complementary":[77],"rules.":[78,114],"Our":[79],"automated":[80],"pipeline":[81],"first":[82],"uses":[83],"generate":[86],"diverse":[87],"rules":[88,101,116],"covering":[89],"multiple":[90],"aspects":[91],"quality,":[94],"then":[95,118],"rates":[96],"samples":[97,127],"according":[98],"applies":[103],"determinantal":[105],"point":[106],"process":[107],"(DPP)":[108],"most":[112],"independent":[113],"These":[115],"are":[117,128],"used":[119],"full":[123],"dataset,":[124],"high-scoring":[126],"selected":[129,158],"downstream":[131,180],"tasks":[132],"such":[133],"as":[134],"LLM":[135],"fine-tuning.":[136],"our":[139,170],"in":[141],"two":[142],"experiment":[143],"setups:":[144],"(1)":[145],"alignment":[146],"with":[147],"ground-truth":[148],"ratings":[149],"(2)":[151],"fine-tuned":[155],"data.":[159],"Experiments":[160],"across":[161],"IMDB,":[162],"Medical,":[163],"Math,":[164],"Code":[166],"domains":[167],"demonstrate":[168],"DPP-based":[171],"consistently":[174],"improves":[175],"both":[176],"rating":[177],"accuracy":[178],"model":[181],"over":[183],"strong":[184],"baselines.":[185]},"counts_by_year":[],"updated_date":"2026-03-20T20:47:17.329874","created_date":"2026-03-18T00:00:00"}
