{"id":"https://openalex.org/W7127178521","doi":"https://doi.org/10.48550/arxiv.2601.22952","title":"Sifting the Noise: A Comparative Study of LLM Agents in Vulnerability False Positive Filtering","display_name":"Sifting the Noise: A Comparative Study of LLM Agents in Vulnerability False Positive Filtering","publication_year":2026,"publication_date":"2026-01-30","ids":{"openalex":"https://openalex.org/W7127178521","doi":"https://doi.org/10.48550/arxiv.2601.22952"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2601.22952","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124828237","display_name":"Yunpeng Xiong","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Xiong, Yunpeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5124831487","display_name":"Ting Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Ting","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5124828237"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12479","display_name":"Web Application Security Vulnerabilities","score":0.22699999809265137,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12479","display_name":"Web Application Security Vulnerabilities","score":0.22699999809265137,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10743","display_name":"Software Testing and Debugging Techniques","score":0.20409999787807465,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.0843999981880188,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/false-positive-paradox","display_name":"False positive paradox","score":0.7285000085830688},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.593500018119812},{"id":"https://openalex.org/keywords/false-positives-and-false-negatives","display_name":"False positives and false negatives","score":0.5681999921798706},{"id":"https://openalex.org/keywords/vulnerability","display_name":"Vulnerability (computing)","score":0.47110000252723694},{"id":"https://openalex.org/keywords/identification","display_name":"Identification (biology)","score":0.4311000108718872},{"id":"https://openalex.org/keywords/false-positive-rate","display_name":"False positive rate","score":0.3856000006198883},{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.384799987077713}],"concepts":[{"id":"https://openalex.org/C64869954","wikidata":"https://www.wikidata.org/wiki/Q1859747","display_name":"False positive paradox","level":2,"score":0.7285000085830688},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7031000256538391},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.593500018119812},{"id":"https://openalex.org/C112789634","wikidata":"https://www.wikidata.org/wiki/Q18207010","display_name":"False positives and false negatives","level":3,"score":0.5681999921798706},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.49729999899864197},{"id":"https://openalex.org/C95713431","wikidata":"https://www.wikidata.org/wiki/Q631425","display_name":"Vulnerability (computing)","level":2,"score":0.47110000252723694},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.4311000108718872},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4138000011444092},{"id":"https://openalex.org/C95922358","wikidata":"https://www.wikidata.org/wiki/Q5432725","display_name":"False positive rate","level":2,"score":0.3856000006198883},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.384799987077713},{"id":"https://openalex.org/C2777120189","wikidata":"https://www.wikidata.org/wiki/Q780067","display_name":"Triage","level":2,"score":0.3517000079154968},{"id":"https://openalex.org/C548217200","wikidata":"https://www.wikidata.org/wiki/Q251","display_name":"Java","level":2,"score":0.32600000500679016},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.3061999976634979},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.3043999969959259},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.3037000000476837},{"id":"https://openalex.org/C3019252630","wikidata":"https://www.wikidata.org/wiki/Q6549547","display_name":"Limited resources","level":2,"score":0.29649999737739563},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.2948000133037567},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.290800005197525},{"id":"https://openalex.org/C22680326","wikidata":"https://www.wikidata.org/wiki/Q7444867","display_name":"Secure coding","level":5,"score":0.26019999384880066}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2601.22952","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2601.22952","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.22952","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2601.22952","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Static":[0],"Application":[1],"Security":[2],"Testing":[3],"(SAST)":[4],"tools":[5],"are":[6,173,237],"essential":[7],"for":[8,65,90,184,200,243],"identifying":[9],"software":[10],"vulnerabilities,":[11,215],"but":[12,194,240],"they":[13],"often":[14],"produce":[15],"a":[16,24,40,76,238],"high":[17],"volume":[18],"of":[19,60,79,121,130,152,161,171,212,255],"false":[20],"positives":[21],"(FPs),":[22],"imposing":[23],"substantial":[25],"manual":[26],"triage":[27],"burden":[28],"on":[29,133],"developers.":[30],"Recent":[31],"advances":[32],"in":[33,142,224],"Large":[34],"Language":[35],"Model":[36],"(LLM)":[37],"agents":[38,116,154,172,236],"offer":[39],"promising":[41],"direction":[42],"by":[43],"enabling":[44],"iterative":[45],"reasoning,":[46],"tool":[47],"use,":[48],"and":[49,88,105,176,192,247,263],"environment":[50],"interaction":[51],"to":[52,137,163],"refine":[53],"SAST":[54,122,244],"alerts.":[55,167],"However,":[56,168],"the":[57,99,102,119,134,143,149,169,210],"comparative":[58,77],"effectiveness":[59],"different":[61],"LLM-based":[62,82,115,153,235],"agent":[63,83,228,256],"architectures":[64],"FP":[66,92,127,158,205,245],"filtering":[67],"remains":[68],"poorly":[69],"understood.":[70],"In":[71],"this":[72],"paper,":[73],"we":[74,220],"present":[75],"study":[78,232],"three":[80],"state-of-the-art":[81],"frameworks,":[84],"i.e.,":[85],"Aider,":[86],"OpenHands,":[87],"SWE-agent,":[89],"vulnerability":[91,261],"filtering.":[93],"We":[94],"evaluate":[95],"these":[96],"frameworks":[97,179],"using":[98],"vulnerabilities":[100],"from":[101],"OWASP":[103,135],"Benchmark":[104,136],"real-world":[106,147],"open-source":[107],"Java":[108],"projects.":[109],"The":[110],"experimental":[111],"results":[112],"show":[113],"that":[114,234,248],"can":[117,155,207],"remove":[118],"majority":[120],"noise,":[123],"reducing":[124],"an":[125,157],"initial":[126],"detection":[128],"rate":[129,160],"over":[131],"92%":[132],"as":[138,140,188],"low":[139],"6.3%":[141],"best":[144,150],"configuration.":[145],"On":[146],"dataset,":[148],"configuration":[151],"achieve":[156],"identification":[159],"up":[162],"93.3%":[164],"involving":[165],"CodeQL":[166],"benefits":[170],"strongly":[174],"backbone-":[175],"CWE-dependent:":[177],"agentic":[178],"significantly":[180],"outperform":[181],"vanilla":[182],"prompting":[183],"stronger":[185],"models":[186],"such":[187],"Claude":[189],"Sonnet":[190],"4":[191],"GPT-5,":[193],"yield":[195],"limited":[196],"or":[197],"inconsistent":[198],"gains":[199],"weaker":[201],"backbones.":[202],"Moreover,":[203],"aggressive":[204],"reduction":[206],"come":[208],"at":[209],"cost":[211,226],"suppressing":[213],"true":[214],"highlighting":[216],"important":[217],"trade-offs.":[218],"Finally,":[219],"observe":[221],"large":[222],"disparities":[223],"computational":[225],"across":[227],"frameworks.":[229],"Overall,":[230],"our":[231],"demonstrates":[233],"powerful":[239],"non-uniform":[241],"solution":[242],"filtering,":[246],"their":[249],"practical":[250],"deployment":[251],"requires":[252],"careful":[253],"consideration":[254],"design,":[257],"backbone":[258],"model":[259],"choice,":[260],"category,":[262],"operational":[264],"cost.":[265]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-03T00:00:00"}
