{"id":"https://openalex.org/W7160837719","doi":"https://doi.org/10.48550/arxiv.2605.07001","title":"SmellBench: Evaluating LLM Agents on Architectural Code Smell Repair","display_name":"SmellBench: Evaluating LLM Agents on Architectural Code Smell Repair","publication_year":2026,"publication_date":"2026-05-07","ids":{"openalex":"https://openalex.org/W7160837719","doi":"https://doi.org/10.48550/arxiv.2605.07001"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.07001","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.07001","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.07001","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5106807925","display_name":"Ion George Dinu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dinu, Ion George","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037824821","display_name":"Marian Cristian Mih\u0103escu","orcid":"https://orcid.org/0000-0003-0350-0441"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mih\u0103escu, Marian Cristian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5108156431","display_name":"Traian Rebedea","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rebedea, Traian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.9629999995231628,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.9629999995231628,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10743","display_name":"Software Testing and Debugging Techniques","score":0.01549999974668026,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.004100000020116568,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/codebase","display_name":"Codebase","score":0.902899980545044},{"id":"https://openalex.org/keywords/code-smell","display_name":"Code smell","score":0.8276000022888184},{"id":"https://openalex.org/keywords/code-refactoring","display_name":"Code refactoring","score":0.5605000257492065},{"id":"https://openalex.org/keywords/python","display_name":"Python (programming language)","score":0.4884999990463257},{"id":"https://openalex.org/keywords/software-quality","display_name":"Software quality","score":0.4431999921798706},{"id":"https://openalex.org/keywords/scripting-language","display_name":"Scripting language","score":0.43959999084472656},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.43320000171661377},{"id":"https://openalex.org/keywords/maintainability","display_name":"Maintainability","score":0.4059000015258789},{"id":"https://openalex.org/keywords/artifact","display_name":"Artifact (error)","score":0.39169999957084656},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.3912999927997589}],"concepts":[{"id":"https://openalex.org/C51929080","wikidata":"https://www.wikidata.org/wiki/Q2425187","display_name":"Codebase","level":3,"score":0.902899980545044},{"id":"https://openalex.org/C133237599","wikidata":"https://www.wikidata.org/wiki/Q2295111","display_name":"Code smell","level":5,"score":0.8276000022888184},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6574000120162964},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.5877000093460083},{"id":"https://openalex.org/C152752567","wikidata":"https://www.wikidata.org/wiki/Q116877","display_name":"Code refactoring","level":3,"score":0.5605000257492065},{"id":"https://openalex.org/C519991488","wikidata":"https://www.wikidata.org/wiki/Q28865","display_name":"Python (programming language)","level":2,"score":0.4884999990463257},{"id":"https://openalex.org/C117447612","wikidata":"https://www.wikidata.org/wiki/Q1412670","display_name":"Software quality","level":4,"score":0.4431999921798706},{"id":"https://openalex.org/C61423126","wikidata":"https://www.wikidata.org/wiki/Q187432","display_name":"Scripting language","level":2,"score":0.43959999084472656},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.43320000171661377},{"id":"https://openalex.org/C160713754","wikidata":"https://www.wikidata.org/wiki/Q1389965","display_name":"Maintainability","level":2,"score":0.4059000015258789},{"id":"https://openalex.org/C2779010991","wikidata":"https://www.wikidata.org/wiki/Q2720909","display_name":"Artifact (error)","level":2,"score":0.39169999957084656},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3912999927997589},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.3846000134944916},{"id":"https://openalex.org/C64869954","wikidata":"https://www.wikidata.org/wiki/Q1859747","display_name":"False positive paradox","level":2,"score":0.3718999922275543},{"id":"https://openalex.org/C137287247","wikidata":"https://www.wikidata.org/wiki/Q1329550","display_name":"Static program analysis","level":4,"score":0.3544999957084656},{"id":"https://openalex.org/C101317890","wikidata":"https://www.wikidata.org/wiki/Q940053","display_name":"Software maintenance","level":4,"score":0.3303000032901764},{"id":"https://openalex.org/C529173508","wikidata":"https://www.wikidata.org/wiki/Q638608","display_name":"Software development","level":3,"score":0.322299987077713},{"id":"https://openalex.org/C99613125","wikidata":"https://www.wikidata.org/wiki/Q165194","display_name":"Application programming interface","level":2,"score":0.32199999690055847},{"id":"https://openalex.org/C10272871","wikidata":"https://www.wikidata.org/wiki/Q929972","display_name":"Software inspection","level":5,"score":0.30469998717308044},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.3043999969959259},{"id":"https://openalex.org/C1009929","wikidata":"https://www.wikidata.org/wiki/Q179550","display_name":"Software bug","level":3,"score":0.30219998955726624},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.2921999990940094},{"id":"https://openalex.org/C149091818","wikidata":"https://www.wikidata.org/wiki/Q2429814","display_name":"Software system","level":3,"score":0.29109999537467957},{"id":"https://openalex.org/C170130773","wikidata":"https://www.wikidata.org/wiki/Q216378","display_name":"Usability","level":2,"score":0.28619998693466187},{"id":"https://openalex.org/C2776654903","wikidata":"https://www.wikidata.org/wiki/Q2601463","display_name":"SAFER","level":2,"score":0.27379998564720154},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2639999985694885},{"id":"https://openalex.org/C72280835","wikidata":"https://www.wikidata.org/wiki/Q635346","display_name":"Architectural pattern","level":5,"score":0.2637999951839447},{"id":"https://openalex.org/C202105479","wikidata":"https://www.wikidata.org/wiki/Q265013","display_name":"Software evolution","level":5,"score":0.2612999975681305},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.26089999079704285},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.2596000134944916},{"id":"https://openalex.org/C111065885","wikidata":"https://www.wikidata.org/wiki/Q1189053","display_name":"Fuzz testing","level":3,"score":0.25699999928474426},{"id":"https://openalex.org/C52913732","wikidata":"https://www.wikidata.org/wiki/Q857102","display_name":"Software design","level":4,"score":0.2538999915122986}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.07001","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.07001","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.07001","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.07001","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","score":0.6678057312965393,"id":"https://metadata.un.org/sdg/9"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Architectural":[0],"code":[1,47,62,189,217],"smells":[2,48,116,136],"erode":[3],"software":[4,212],"maintainability":[5],"and":[6,27,39,77,95,163,191,218],"are":[7,137,167],"costly":[8],"to":[9,44,155],"repair":[10,45,90,161],"manually,":[11],"yet":[12],"unlike":[13],"localized":[14,188],"bugs,":[15],"they":[16],"require":[17],"cross-module":[18,197],"reasoning":[19],"about":[20],"design":[21],"intent":[22],"that":[23,72,87,132],"challenges":[24],"both":[25],"developers":[26],"automated":[28,211],"tools.":[29],"While":[30],"large":[31],"language":[32],"model":[33,106],"agents":[34,59],"excel":[35],"at":[36,220],"bug":[37],"fixing":[38],"code-level":[40],"refactoring,":[41],"their":[42],"ability":[43],"architectural":[46,61,115,193],"remains":[49],"unexplored.":[50],"We":[51,65,99,214],"present":[52],"the":[53,121,141,170,192],"first":[54],"empirical":[55],"evaluation":[56],"of":[57,134,210],"LLM":[58,185],"on":[60,112,206],"smell":[63],"repair.":[64],"contribute":[66],"SmellBench,":[67],"a":[68,84,145,181],"task":[69],"orchestration":[70],"framework":[71],"incorporates":[73],"smell-type-specific":[74],"optimized":[75],"prompts":[76],"supports":[78],"iterative":[79],"multi-step":[80],"execution,":[81],"together":[82],"with":[83,153],"scoring":[85],"methodology":[86],"separately":[88],"evaluates":[89],"effectiveness,":[91],"false":[92,138,151],"positive":[93],"identification,":[94],"net":[96,164],"codebase":[97,165],"impact.":[98],"evaluate":[100],"11":[101],"agent":[102,143,173],"configurations":[103],"from":[104],"four":[105],"families":[107],"(GPT,":[108],"Claude,":[109],"Gemini,":[110],"Mistral)":[111],"65":[113],"hard-severity":[114],"detected":[117,135],"by":[118],"PyExamine":[119],"in":[120,187],"Python":[122],"project":[123],"scikit-learn,":[124],"validated":[125],"against":[126],"expert":[127,158],"judgments.":[128],"Expert":[129],"validation":[130],"reveals":[131],"63.1%":[133],"positives,":[139],"while":[140],"best":[142],"achieves":[144],"47.7%":[146],"resolution":[147],"rate.":[148],"Agents":[149],"identify":[150],"positives":[152],"up":[154],"$\u03ba=":[156],"0.94$":[157],"agreement,":[159],"but":[160],"aggressiveness":[162],"quality":[166],"inversely":[168],"related:":[169],"most":[171],"aggressive":[172],"introduces":[174],"140":[175],"new":[176],"smells.":[177],"These":[178],"findings":[179],"expose":[180],"gap":[182],"between":[183],"current":[184],"capabilities":[186],"transformations":[190],"understanding":[194],"needed":[195],"for":[196,203],"refactoring.":[198],"SmellBench":[199],"provides":[200],"reusable":[201],"infrastructure":[202],"tracking":[204],"progress":[205],"this":[207],"underexplored":[208],"dimension":[209],"engineering.":[213],"release":[215],"our":[216],"data":[219],"https://doi.org/10.5281/zenodo.19247588.":[221]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-12T00:00:00"}
