{"id":"https://openalex.org/W7138458161","doi":"https://doi.org/10.1609/aaai.v40i37.40437","title":"OSVBench: Benchmarking LLMs on Specification Generation Tasks for Operating System Verification","display_name":"OSVBench: Benchmarking LLMs on Specification Generation Tasks for Operating System Verification","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7138458161","doi":"https://doi.org/10.1609/aaai.v40i37.40437"},"language":null,"primary_location":{"id":"doi:10.1609/aaai.v40i37.40437","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i37.40437","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://doi.org/10.1609/aaai.v40i37.40437","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129662045","display_name":"Shangyu Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shangyu Li","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007147042","display_name":"Juyong Jiang","orcid":"https://orcid.org/0000-0003-0835-9686"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Juyong Jiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114246802","display_name":"Tiancheng Zhao","orcid":"https://orcid.org/0000-0002-7443-0666"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tiancheng Zhao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5024117197","display_name":"Jiasi Shen","orcid":"https://orcid.org/0000-0002-5904-3641"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiasi Shen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.18181818,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"37","first_page":"31698","last_page":"31707"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10743","display_name":"Software Testing and Debugging Techniques","score":0.1809999942779541,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10743","display_name":"Software Testing and Debugging Techniques","score":0.1809999942779541,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10639","display_name":"Advanced Software Engineering Methodologies","score":0.15109999477863312,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.1467999964952469,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.7095999717712402},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.625},{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.5760999917984009},{"id":"https://openalex.org/keywords/correctness","display_name":"Correctness","score":0.5497999787330627},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.4016000032424927},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.3806999921798706}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7300000190734863},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.7095999717712402},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.625},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.5760999917984009},{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.5497999787330627},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.5264999866485596},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.4016000032424927},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3806999921798706},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.3517000079154968},{"id":"https://openalex.org/C116253237","wikidata":"https://www.wikidata.org/wiki/Q1437424","display_name":"Formal specification","level":2,"score":0.35030001401901245},{"id":"https://openalex.org/C60048249","wikidata":"https://www.wikidata.org/wiki/Q37437","display_name":"Syntax","level":2,"score":0.3199999928474426},{"id":"https://openalex.org/C133162039","wikidata":"https://www.wikidata.org/wiki/Q1061077","display_name":"Code generation","level":3,"score":0.3003999888896942},{"id":"https://openalex.org/C34165917","wikidata":"https://www.wikidata.org/wiki/Q188267","display_name":"Programming paradigm","level":2,"score":0.2702000141143799},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.25600001215934753}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v40i37.40437","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i37.40437","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i37.40437","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i37.40437","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0],"introduce":[1],"OSVBench,":[2],"a":[3,34,54,69,75,89,133,144],"new":[4],"benchmark":[5,30,62],"for":[6,20,77,111,143,170],"evaluating":[7],"Large":[8],"Language":[9],"Models":[10],"(LLMs)":[11],"on":[12,165],"the":[13,22,64,99,107,121,127,150,166],"task":[14,56,67,169],"of":[15,25,42,51,57,149,162],"generating":[16,116],"complete":[17],"formal":[18,117],"specifications":[19],"verifying":[21],"functional":[23,125],"correctness":[24],"operating":[26,36,122,151,171],"system":[27,37,172],"kernels.":[28],"This":[29,82],"is":[31,53,84],"built":[32],"upon":[33],"real-world":[35],"kernel,":[38],"Hyperkernel,":[39],"and":[40,80,102,113,115,141],"consists":[41],"245":[43],"complex":[44],"specification":[45,65,134,167],"generation":[46,66,168,188],"tasks":[47],"in":[48,176,181],"total,":[49],"each":[50],"which":[52],"long-context":[55,186],"about":[58],"20k-30k":[59],"tokens.":[60],"The":[61,92],"formulates":[63],"as":[68],"program":[70],"synthesis":[71],"problem":[72],"confined":[73],"to":[74,86,97,131,184],"domain":[76],"specifying":[78],"states":[79,140],"transitions.":[81],"formulation":[83],"provided":[85],"LLMs":[87,93,128,158,164],"through":[88],"programming":[90,100],"model.":[91],"must":[94],"be":[95],"able":[96],"understand":[98],"model":[101],"verification":[103],"assumptions":[104],"before":[105],"delineating":[106],"correct":[108,139],"search":[109],"space":[110],"syntax":[112],"semantics":[114],"specifications.":[118],"Guided":[119],"by":[120],"system's":[123],"high-level":[124],"description,":[126],"are":[129],"asked":[130],"generate":[132],"that":[135],"fully":[136],"describes":[137],"all":[138],"transitions":[142],"potentially":[145],"buggy":[146],"code":[147,187],"implementation":[148],"system.":[152],"Experimental":[153],"results":[154],"with":[155],"12":[156],"state-of-the-art":[157],"indicate":[159],"limited":[160],"performance":[161,178],"existing":[163],"verification.":[173],"Significant":[174],"disparities":[175],"their":[177,182],"highlight":[179],"differences":[180],"ability":[183],"handle":[185],"tasks.":[189]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-18T00:00:00"}
