{"id":"https://openalex.org/W7152534212","doi":"https://doi.org/10.48550/arxiv.2604.06742","title":"Evaluating LLM-Based 0-to-1 Software Generation in End-to-End CLI Tool Scenarios","display_name":"Evaluating LLM-Based 0-to-1 Software Generation in End-to-End CLI Tool Scenarios","publication_year":2026,"publication_date":"2026-04-08","ids":{"openalex":"https://openalex.org/W7152534212","doi":"https://doi.org/10.48550/arxiv.2604.06742"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.06742","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.06742","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.06742","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133272099","display_name":"Ruida Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Hu, Ruida","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133241957","display_name":"Xinchen Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xinchen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133299201","display_name":"Chao Peng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peng, Chao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133261643","display_name":"Cuiyun Gao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gao, Cuiyun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133238359","display_name":"David Lo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lo, David","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5133272099"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.16609999537467957,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.16609999537467957,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.11869999766349792,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.05420000106096268,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.5756999850273132},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.5162000060081482},{"id":"https://openalex.org/keywords/equivalence","display_name":"Equivalence (formal languages)","score":0.4595000147819519},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.4537999927997589},{"id":"https://openalex.org/keywords/bridge","display_name":"Bridge (graph theory)","score":0.4406999945640564},{"id":"https://openalex.org/keywords/interface","display_name":"Interface (matter)","score":0.38749998807907104},{"id":"https://openalex.org/keywords/terminal","display_name":"Terminal (telecommunication)","score":0.3856000006198883}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7466999888420105},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.5756999850273132},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.5162000060081482},{"id":"https://openalex.org/C2780069185","wikidata":"https://www.wikidata.org/wiki/Q7977945","display_name":"Equivalence (formal languages)","level":2,"score":0.4595000147819519},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.4537999927997589},{"id":"https://openalex.org/C100776233","wikidata":"https://www.wikidata.org/wiki/Q2532492","display_name":"Bridge (graph theory)","level":2,"score":0.4406999945640564},{"id":"https://openalex.org/C113843644","wikidata":"https://www.wikidata.org/wiki/Q901882","display_name":"Interface (matter)","level":4,"score":0.38749998807907104},{"id":"https://openalex.org/C2779664074","wikidata":"https://www.wikidata.org/wiki/Q3518405","display_name":"Terminal (telecommunication)","level":2,"score":0.3856000006198883},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.37560001015663147},{"id":"https://openalex.org/C149091818","wikidata":"https://www.wikidata.org/wiki/Q2429814","display_name":"Software system","level":3,"score":0.33090001344680786},{"id":"https://openalex.org/C93226319","wikidata":"https://www.wikidata.org/wiki/Q193137","display_name":"Differential (mechanical device)","level":2,"score":0.3100000023841858},{"id":"https://openalex.org/C124304363","wikidata":"https://www.wikidata.org/wiki/Q673661","display_name":"Abstraction","level":2,"score":0.29980000853538513},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2856999933719635},{"id":"https://openalex.org/C529173508","wikidata":"https://www.wikidata.org/wiki/Q638608","display_name":"Software development","level":3,"score":0.28299999237060547},{"id":"https://openalex.org/C174683762","wikidata":"https://www.wikidata.org/wiki/Q609588","display_name":"Component-based software engineering","level":4,"score":0.27239999175071716},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2655999958515167},{"id":"https://openalex.org/C200601418","wikidata":"https://www.wikidata.org/wiki/Q2193887","display_name":"Reliability engineering","level":1,"score":0.2597000002861023},{"id":"https://openalex.org/C178059732","wikidata":"https://www.wikidata.org/wiki/Q1982529","display_name":"Software performance testing","level":5,"score":0.25369998812675476}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.06742","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.06742","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.06742","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.06742","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"Language":[1],"Models":[2],"(LLMs)":[3],"are":[4],"driving":[5],"a":[6,58,79],"shift":[7],"towards":[8],"intent-driven":[9],"development,":[10],"where":[11],"agents":[12,134],"build":[13],"complete":[14],"software":[15,85],"from":[16],"scratch.":[17],"However,":[18],"existing":[19],"benchmarks":[20],"fail":[21],"to":[22,29,136],"assess":[23],"this":[24,53],"0-to-1":[25,122],"generation":[26,65],"capability":[27],"due":[28],"two":[30],"limitations:":[31],"reliance":[32],"on":[33],"predefined":[34],"scaffolds":[35],"that":[36,46,110],"ignore":[37],"repository":[38],"structure":[39],"planning,":[40],"and":[41,94,133],"rigid":[42],"white-box":[43],"unit":[44],"testing":[45,82],"lacks":[47],"end-to-end":[48],"behavioral":[49],"validation.":[50],"To":[51],"bridge":[52],"gap,":[54],"we":[55,108],"introduce":[56],"CLI-Tool-Bench,":[57],"structure-agnostic":[59],"benchmark":[60],"for":[61],"evaluating":[62],"the":[63,118],"ground-up":[64],"of":[66,121],"Command-Line":[67],"Interface":[68],"(CLI)":[69],"tools.":[70],"It":[71],"features":[72],"100":[73],"diverse":[74],"real-world":[75],"repositories":[76],"evaluated":[77],"via":[78],"black-box":[80],"differential":[81],"framework.":[83],"Agent-generated":[84],"is":[86],"executed":[87],"in":[88],"sandboxes,":[89],"comparing":[90],"system":[91],"side":[92],"effects":[93],"terminal":[95],"outputs":[96],"against":[97],"human-written":[98],"oracles":[99],"using":[100],"multi-tiered":[101],"equivalence":[102],"metrics.":[103],"Evaluating":[104],"seven":[105],"state-of-the-art":[106],"LLMs,":[107],"reveal":[109],"top":[111],"models":[112],"achieve":[113],"under":[114],"43%":[115],"success,":[116],"highlighting":[117],"ongoing":[119],"challenge":[120],"generation.":[123],"Furthermore,":[124],"higher":[125],"token":[126],"consumption":[127],"does":[128],"not":[129],"guarantee":[130],"better":[131],"performance,":[132],"tend":[135],"generate":[137],"monolithic":[138],"code.":[139]},"counts_by_year":[],"updated_date":"2026-04-10T06:07:51.998497","created_date":"2026-04-10T00:00:00"}
