{"id":"https://openalex.org/W6967362843","doi":"https://doi.org/10.48550/arxiv.2503.14499","title":"Measuring AI Ability to Complete Long Software Tasks","display_name":"Measuring AI Ability to Complete Long Software Tasks","publication_year":2025,"publication_date":"2025-03-18","ids":{"openalex":"https://openalex.org/W6967362843","doi":"https://doi.org/10.48550/arxiv.2503.14499"},"language":"en","primary_location":{"id":"doi:10.48550/arxiv.2503.14499","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2503.14499","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2503.14499","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Kwa, Thomas","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Kwa, Thomas","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"West, Ben","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"West, Ben","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Becker, Joel","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Becker, Joel","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Deng, Amy","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Deng, Amy","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Garcia, Katharyn","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Garcia, Katharyn","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Hasin, Max","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hasin, Max","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Jawhar, Sami","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jawhar, Sami","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Kinniment, Megan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kinniment, Megan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Rush, Nate","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rush, Nate","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Von Arx, Sydney","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Von Arx, Sydney","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Bloom, Ryan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bloom, Ryan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Broadley, Thomas","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Broadley, Thomas","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Du, Haoxing","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Du, Haoxing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Goodrich, Brian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Goodrich, Brian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Jurkovic, Nikola","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jurkovic, Nikola","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Miles, Luke Harold","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Miles, Luke Harold","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Nix, Seraphina","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nix, Seraphina","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Lin, Tao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Tao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Parikh, Neev","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Parikh, Neev","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Rein, David","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rein, David","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Sato, Lucas Jun Koba","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sato, Lucas Jun Koba","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Wijk, Hjalmar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wijk, Hjalmar","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Ziegler, Daniel M.","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ziegler, Daniel M.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Barnes, Elizabeth","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Barnes, Elizabeth","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Chan, Lawrence","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chan, Lawrence","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":25,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":12,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.38589999079704285,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.38589999079704285,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.1039000004529953,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.10119999945163727,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5425000190734863},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.48669999837875366},{"id":"https://openalex.org/keywords/extrapolation","display_name":"Extrapolation","score":0.4397999942302704},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.41819998621940613},{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.40299999713897705},{"id":"https://openalex.org/keywords/time-horizon","display_name":"Time horizon","score":0.37950000166893005},{"id":"https://openalex.org/keywords/domain-knowledge","display_name":"Domain knowledge","score":0.3695000112056732}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7135999798774719},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5687999725341797},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5425000190734863},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.48669999837875366},{"id":"https://openalex.org/C132459708","wikidata":"https://www.wikidata.org/wiki/Q744069","display_name":"Extrapolation","level":2,"score":0.4397999942302704},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.41819998621940613},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4050000011920929},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.40299999713897705},{"id":"https://openalex.org/C28761237","wikidata":"https://www.wikidata.org/wiki/Q7805321","display_name":"Time horizon","level":2,"score":0.37950000166893005},{"id":"https://openalex.org/C207685749","wikidata":"https://www.wikidata.org/wiki/Q2088941","display_name":"Domain knowledge","level":2,"score":0.3695000112056732},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.3463999927043915},{"id":"https://openalex.org/C2780876879","wikidata":"https://www.wikidata.org/wiki/Q3054749","display_name":"Meaning (existential)","level":2,"score":0.3190999925136566},{"id":"https://openalex.org/C149091818","wikidata":"https://www.wikidata.org/wiki/Q2429814","display_name":"Software system","level":3,"score":0.29840001463890076},{"id":"https://openalex.org/C159176650","wikidata":"https://www.wikidata.org/wiki/Q43261","display_name":"Horizon","level":2,"score":0.2800000011920929},{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.2768000066280365},{"id":"https://openalex.org/C117447612","wikidata":"https://www.wikidata.org/wiki/Q1412670","display_name":"Software quality","level":4,"score":0.257099986076355}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2503.14499","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2503.14499","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2503.14499","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2503.14499","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Despite":[0],"rapid":[1],"progress":[2],"on":[3,61],"AI":[4,19,45,77,95,118,186],"benchmarks,":[5],"the":[6,16,36,108,147,161],"real-world":[7,174],"meaning":[8],"of":[9,18,23,64,89,149,156,163,178,191],"benchmark":[10],"performance":[11],"remains":[12],"unclear.":[13],"To":[14],"quantify":[15],"capabilities":[17],"systems":[20,187],"in":[21,113,117],"terms":[22],"human":[24],"capabilities,":[25],"we":[26],"propose":[27],"a":[28,62,85,200],"new":[29],"metric:":[30],"50%-task-completion":[31],"time":[32,37,87,96,120],"horizon.":[33],"This":[34],"is":[35],"humans":[38,56,199],"typically":[39],"take":[40,198],"to":[41,123,132,134,173],"complete":[42,48],"tasks":[43,195],"that":[44,182,196],"models":[46,78],"can":[47],"with":[49,57,137],"50%":[50,86],"success":[51],"rate.":[52],"We":[53,145],"first":[54],"timed":[55],"relevant":[58],"domain":[59],"expertise":[60],"combination":[63],"RE-Bench,":[65],"HCAST,":[66],"and":[67,130,141,160],"66":[68],"novel":[69],"shorter":[70],"tasks.":[71],"On":[72],"these":[73,170],"tasks,":[74,176],"current":[75],"frontier":[76,94],"such":[79],"as":[80],"Claude":[81],"3.7":[82],"Sonnet":[83],"have":[84,111],"horizon":[88,97],"around":[90],"50":[91],"minutes.":[92],"Furthermore,":[93],"has":[98],"been":[99],"doubling":[100],"approximately":[101],"every":[102],"seven":[103],"months":[104],"since":[105],"2019,":[106],"though":[107],"trend":[109,180],"may":[110],"accelerated":[112],"2024.":[114],"The":[115],"increase":[116],"models'":[119],"horizons":[121],"seems":[122],"be":[124,189],"primarily":[125],"driven":[126],"by":[127],"greater":[128],"reliability":[129],"ability":[131],"adapt":[133],"mistakes,":[135],"combined":[136],"better":[138],"logical":[139],"reasoning":[140],"tool":[142],"use":[143],"capabilities.":[144,168],"discuss":[146],"limitations":[148],"our":[150],"results":[151,171],"--":[152,159],"including":[153],"their":[154],"degree":[155],"external":[157],"validity":[158],"implications":[162],"increased":[164],"autonomy":[165],"for":[166],"dangerous":[167],"If":[169],"generalize":[172],"software":[175,194],"extrapolation":[177],"this":[179],"predicts":[181],"within":[183],"5":[184],"years,":[185],"will":[188],"capable":[190],"automating":[192],"many":[193],"currently":[197],"month.":[201]},"counts_by_year":[{"year":2026,"cited_by_count":4},{"year":2025,"cited_by_count":8}],"updated_date":"2026-04-11T08:14:18.477133","created_date":"2025-10-10T00:00:00"}
