{"id":"https://openalex.org/W7140225170","doi":"https://doi.org/10.48550/arxiv.2603.22213","title":"SPA: A Simple but Tough-to-Beat Baseline for Knowledge Injection","display_name":"SPA: A Simple but Tough-to-Beat Baseline for Knowledge Injection","publication_year":2026,"publication_date":"2026-03-23","ids":{"openalex":"https://openalex.org/W7140225170","doi":"https://doi.org/10.48550/arxiv.2603.22213"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.22213","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.22213","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.22213","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Tang, Kexian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tang, Kexian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Wang, Jiani","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jiani","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Wang, Shaowen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Shaowen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Lyu, Kaifeng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lyu, Kaifeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.41280001401901245,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.41280001401901245,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13702","display_name":"Machine Learning in Healthcare","score":0.09629999846220016,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.06610000133514404,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.8084999918937683},{"id":"https://openalex.org/keywords/simple","display_name":"Simple (philosophy)","score":0.6514000296592712},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5719000101089478},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5374000072479248},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.4966000020503998},{"id":"https://openalex.org/keywords/synthetic-data","display_name":"Synthetic data","score":0.4650999903678894},{"id":"https://openalex.org/keywords/data-set","display_name":"Data set","score":0.42820000648498535}],"concepts":[{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.8084999918937683},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7160000205039978},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.6514000296592712},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5719000101089478},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5374000072479248},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.4966000020503998},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4788999855518341},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.4650999903678894},{"id":"https://openalex.org/C58489278","wikidata":"https://www.wikidata.org/wiki/Q1172284","display_name":"Data set","level":2,"score":0.42820000648498535},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.42399999499320984},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.382099986076355},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.34049999713897705},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.337799996137619},{"id":"https://openalex.org/C207685749","wikidata":"https://www.wikidata.org/wiki/Q2088941","display_name":"Domain knowledge","level":2,"score":0.30410000681877136},{"id":"https://openalex.org/C120567893","wikidata":"https://www.wikidata.org/wiki/Q1582085","display_name":"Knowledge extraction","level":2,"score":0.2854999899864197},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.28220000863075256},{"id":"https://openalex.org/C2781316041","wikidata":"https://www.wikidata.org/wiki/Q1230584","display_name":"Diversity (politics)","level":2,"score":0.2694999873638153}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.22213","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.22213","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.22213","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.22213","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"While":[0],"large":[1],"language":[2],"models":[3],"(LLMs)":[4],"are":[5],"pretrained":[6],"on":[7],"massive":[8],"amounts":[9],"of":[10,48,77,89],"data,":[11],"their":[12,118],"knowledge":[13,30,58,131],"coverage":[14],"remains":[15],"incomplete":[16],"in":[17,158],"specialized,":[18],"data-scarce":[19],"domains,":[20],"motivating":[21],"extensive":[22],"efforts":[23],"to":[24,52,105],"study":[25],"synthetic":[26,55],"data":[27,56,91,102],"generation":[28],"for":[29,57,130,155],"injection.":[31,59],"We":[32],"propose":[33],"SPA":[34,66,148],"(Scaling":[35],"Prompt-engineered":[36],"Augmentation),":[37],"a":[38,45,152],"simple":[39,115],"but":[40],"tough-to-beat":[41],"baseline":[42,154],"that":[43,65],"uses":[44],"small":[46,94],"set":[47],"carefully":[49],"designed":[50],"prompts":[51],"generate":[53],"large-scale":[54,139],"Through":[60],"systematic":[61],"comparisons,":[62],"we":[63,72,146],"find":[64],"outperforms":[67],"several":[68],"strong":[69,153],"baselines.":[70],"Furthermore,":[71],"identify":[73],"two":[74],"key":[75],"limitations":[76],"prior":[78],"approaches:":[79],"(1)":[80],"while":[81,110],"RL-based":[82],"methods":[83],"may":[84,113],"improve":[85],"the":[86],"token":[87],"efficiency":[88],"LLM-based":[90],"augmentation":[92,116,140],"at":[93,165],"scale,":[95],"they":[96],"suffer":[97],"from":[98],"diversity":[99],"collapse":[100],"as":[101,151],"scales,":[103],"leading":[104],"diminishing":[106],"returns;":[107],"and":[108,145],"(2)":[109],"multi-stage":[111],"prompting":[112],"outperform":[114],"methods,":[117],"advantages":[119],"can":[120,141,149],"disappear":[121],"after":[122],"careful":[123,133],"prompt":[124,134],"tuning.":[125],"Our":[126,161],"results":[127],"suggest":[128],"that,":[129],"injection,":[132],"design":[135],"combined":[136],"with":[137],"straightforward":[138],"be":[142],"surprisingly":[143],"effective,":[144],"hope":[147],"serve":[150],"future":[156],"studies":[157],"this":[159],"area.":[160],"code":[162],"is":[163],"available":[164],"https://github.com/Tangkexian/SPA.":[166]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-03-25T00:00:00"}
