{"id":"https://openalex.org/W7152386181","doi":"https://doi.org/10.48550/arxiv.2604.07021","title":"ModuSeg: Decoupling Object Discovery and Semantic Retrieval for Training-Free Weakly Supervised Segmentation","display_name":"ModuSeg: Decoupling Object Discovery and Semantic Retrieval for Training-Free Weakly Supervised Segmentation","publication_year":2026,"publication_date":"2026-04-08","ids":{"openalex":"https://openalex.org/W7152386181","doi":"https://doi.org/10.48550/arxiv.2604.07021"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.07021","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.07021","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.07021","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101239063","display_name":"Qingze He","orcid":"https://orcid.org/0009-0009-2055-6958"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Qingze","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047183141","display_name":"Fagui Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, 
Fagui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030715932","display_name":"D H Zhang","orcid":"https://orcid.org/0009-0001-2941-0084"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Dengke","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102677150","display_name":"Qingmao Wei","orcid":"https://orcid.org/0000-0001-9982-3119"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei, Qingmao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133264789","display_name":"Quan Tang","orcid":"https://orcid.org/0000-0001-7077-7673"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tang, Quan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.8203999996185303,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network 
Applications","score":0.8203999996185303,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.062300000339746475,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.026799999177455902,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.6866999864578247},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.6704000234603882},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.474700003862381},{"id":"https://openalex.org/keywords/neural-coding","display_name":"Neural coding","score":0.46299999952316284},{"id":"https://openalex.org/keywords/ambiguity","display_name":"Ambiguity","score":0.45899999141693115},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.45019999146461487},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.4410000145435333},{"id":"https://openalex.org/keywords/object","display_name":"Object 
(grammar)","score":0.36899998784065247},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.34450000524520874}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7781000137329102},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.6866999864578247},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.6704000234603882},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6327000260353088},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.474700003862381},{"id":"https://openalex.org/C77637269","wikidata":"https://www.wikidata.org/wiki/Q7002051","display_name":"Neural coding","level":2,"score":0.46299999952316284},{"id":"https://openalex.org/C2780522230","wikidata":"https://www.wikidata.org/wiki/Q1140419","display_name":"Ambiguity","level":2,"score":0.45899999141693115},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.45019999146461487},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4410000145435333},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3765999972820282},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object 
(grammar)","level":2,"score":0.36899998784065247},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.34450000524520874},{"id":"https://openalex.org/C90312973","wikidata":"https://www.wikidata.org/wiki/Q7449052","display_name":"Semantic data model","level":2,"score":0.33489999175071716},{"id":"https://openalex.org/C75608658","wikidata":"https://www.wikidata.org/wiki/Q44395","display_name":"Pascal (unit)","level":2,"score":0.32910001277923584},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.3179999887943268},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.2937000095844269},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.2930000126361847},{"id":"https://openalex.org/C2781122975","wikidata":"https://www.wikidata.org/wiki/Q16928266","display_name":"Semantic feature","level":2,"score":0.2867000102996826},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.28209999203681946},{"id":"https://openalex.org/C136389625","wikidata":"https://www.wikidata.org/wiki/Q334384","display_name":"Supervised learning","level":3,"score":0.2777000069618225},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.2676999866962433},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging 
(networking)","level":2,"score":0.267300009727478},{"id":"https://openalex.org/C86034646","wikidata":"https://www.wikidata.org/wiki/Q474311","display_name":"Semantic gap","level":4,"score":0.26499998569488525},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.25949999690055847},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.25589999556541443},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.2515000104904175}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.07021","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.07021","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.07021","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.07021","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell 
University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10","score":0.7373331189155579}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Weakly":[0],"supervised":[1,76],"semantic":[2,16,77,87,105,125],"segmentation":[3,78,115],"aims":[4],"to":[5,25,48,96,108,133],"achieve":[6],"pixel-level":[7],"predictions":[8],"using":[9],"image-level":[10],"labels.":[11],"Existing":[12],"methods":[13],"typically":[14],"entangle":[15],"recognition":[17],"and":[18,53,86,128,138,161],"object":[19,84],"localization,":[20],"which":[21],"often":[22,54],"leads":[23],"models":[24,34,107],"focus":[26],"exclusively":[27],"on":[28,56,81,166],"sparse":[29],"discriminative":[30],"regions.":[31],"Although":[32],"foundation":[33,106],"show":[35],"immense":[36],"potential,":[37],"many":[38],"approaches":[39],"still":[40],"follow":[41],"the":[42,67,150],"tightly":[43],"coupled":[44],"optimization":[45],"paradigm,":[46],"struggling":[47],"effectively":[49,134],"alleviate":[50],"pseudo-label":[51],"noise":[52],"relying":[55],"time-consuming":[57],"multi-stage":[58],"retraining":[59],"or":[60],"unstable":[61],"end-to-end":[62],"joint":[63],"optimization.":[64],"To":[65],"address":[66],"above":[68],"challenges,":[69],"we":[70,90,123],"present":[71],"ModuSeg,":[72],"a":[73,92,117],"training-free":[74],"weakly":[75],"framework":[79],"centered":[80],"explicitly":[82],"decoupling":[83],"discovery":[85],"assignment.":[88],"Specifically,":[89],"integrate":[91],"general":[93],"mask":[94],"proposer":[95],"extract":[97],"geometric":[98],"proposals":[99],"with"
:[100],"reliable":[101],"boundaries,":[102],"while":[103],"leveraging":[104],"construct":[109],"an":[110],"offline":[111],"feature":[112,119,130],"bank,":[113],"transforming":[114],"into":[116],"non-parametric":[118],"retrieval":[120],"process.":[121],"Furthermore,":[122],"propose":[124],"boundary":[126,136],"purification":[127],"soft-masked":[129],"aggregation":[131],"strategies":[132],"mitigate":[135],"ambiguity":[137],"quantization":[139],"errors,":[140],"thereby":[141],"extracting":[142],"high-quality":[143],"category":[144],"prototypes.":[145],"Extensive":[146],"experiments":[147],"demonstrate":[148],"that":[149],"proposed":[151],"decoupled":[152],"architecture":[153],"better":[154],"preserves":[155],"fine":[156],"boundaries":[157],"without":[158],"parameter":[159],"fine-tuning":[160],"achieves":[162],"highly":[163],"competitive":[164],"performance":[165],"standard":[167],"benchmark":[168],"datasets.":[169],"Code":[170],"is":[171],"available":[172],"at":[173],"https://github.com/Autumnair007/ModuSeg.":[174]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-10T00:00:00"}
