{"id":"https://openalex.org/W4415209236","doi":"https://doi.org/10.1007/978-3-032-08324-1_16","title":"Interpreting the\u00a0Structure of\u00a0Multi-object Representations in\u00a0Vision Encoders","display_name":"Interpreting the\u00a0Structure of\u00a0Multi-object Representations in\u00a0Vision Encoders","publication_year":2025,"publication_date":"2025-10-15","ids":{"openalex":"https://openalex.org/W4415209236","doi":"https://doi.org/10.1007/978-3-032-08324-1_16"},"language":"en","primary_location":{"id":"doi:10.1007/978-3-032-08324-1_16","is_oa":true,"landing_page_url":"https://doi.org/10.1007/978-3-032-08324-1_16","pdf_url":"https://link.springer.com/content/pdf/10.1007/978-3-032-08324-1_16.pdf","source":{"id":"https://openalex.org/S2764900261","display_name":"Communications in computer and information science","issn_l":"1865-0929","issn":["1865-0929","1865-0937"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"book series"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Communications in Computer and Information Science","raw_type":"book-chapter"},"type":"book-chapter","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://link.springer.com/content/pdf/10.1007/978-3-032-08324-1_16.pdf","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5018395014","display_name":"Tarun Khajuria","orcid":"https://orcid.org/0000-0002-7089-659X"},"institutions":[{"id":"https://openalex.org/I56085075","display_name":"University of Tartu","ror":"https://ror.org/03z77qz90","country_code":"EE","type":"education","lineage":["https://openalex.org/I56085075"]}],"countries":["EE"],"is_corresponding":true,"raw_author_name":"Tarun Khajuria","raw_affiliation_strings":["Institute of Computer Science, University of Tartu, Tartu, Estonia"],"affiliations":[{"raw_affiliation_string":"Institute of Computer Science, University of Tartu, Tartu, Estonia","institution_ids":["https://openalex.org/I56085075"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5099173730","display_name":"Braian Olmiro Dias","orcid":"https://orcid.org/0009-0000-1170-7464"},"institutions":[{"id":"https://openalex.org/I56085075","display_name":"University of Tartu","ror":"https://ror.org/03z77qz90","country_code":"EE","type":"education","lineage":["https://openalex.org/I56085075"]}],"countries":["EE"],"is_corresponding":false,"raw_author_name":"Braian Olmiro Dias","raw_affiliation_strings":["Institute of Computer Science, University of Tartu, Tartu, Estonia"],"affiliations":[{"raw_affiliation_string":"Institute of Computer Science, University of Tartu, Tartu, Estonia","institution_ids":["https://openalex.org/I56085075"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007012338","display_name":"Marharyta Domnich","orcid":"https://orcid.org/0000-0001-5414-6089"},"institutions":[{"id":"https://openalex.org/I56085075","display_name":"University of Tartu","ror":"https://ror.org/03z77qz90","country_code":"EE","type":"education","lineage":["https://openalex.org/I56085075"]}],"countries":["EE"],"is_corresponding":false,"raw_author_name":"Marharyta Domnich","raw_affiliation_strings":["Institute of Computer Science, University of Tartu, Tartu, Estonia"],"affiliations":[{"raw_affiliation_string":"Institute of Computer Science, University of Tartu, Tartu, Estonia","institution_ids":["https://openalex.org/I56085075"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5087072092","display_name":"Jaan Aru","orcid":"https://orcid.org/0000-0003-3927-452X"},"institutions":[{"id":"https://openalex.org/I56085075","display_name":"University of Tartu","ror":"https://ror.org/03z77qz90","country_code":"EE","type":"education","lineage":["https://openalex.org/I56085075"]}],"countries":["EE"],"is_corresponding":false,"raw_author_name":"Jaan Aru","raw_affiliation_strings":["Institute of Computer Science, University of Tartu, Tartu, Estonia"],"affiliations":[{"raw_affiliation_string":"Institute of Computer Science, University of Tartu, Tartu, Estonia","institution_ids":["https://openalex.org/I56085075"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5018395014"],"corresponding_institution_ids":["https://openalex.org/I56085075"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.64771429,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"359","last_page":"382"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.9973999857902527,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.7689999938011169},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.6747999787330627},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.6341000199317932},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.6259999871253967},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.6000000238418579},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5902000069618225},{"id":"https://openalex.org/keywords/relevance","display_name":"Relevance (law)","score":0.47699999809265137},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.45170000195503235},{"id":"https://openalex.org/keywords/measure","display_name":"Measure (data warehouse)","score":0.44760000705718994}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8162000179290771},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.7689999938011169},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.6747999787330627},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.6341000199317932},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.6259999871253967},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.6000000238418579},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5964999794960022},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5902000069618225},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4975999891757965},{"id":"https://openalex.org/C158154518","wikidata":"https://www.wikidata.org/wiki/Q7310970","display_name":"Relevance (law)","level":2,"score":0.47699999809265137},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.45170000195503235},{"id":"https://openalex.org/C2780009758","wikidata":"https://www.wikidata.org/wiki/Q6804172","display_name":"Measure (data warehouse)","level":2,"score":0.44760000705718994},{"id":"https://openalex.org/C2780615836","wikidata":"https://www.wikidata.org/wiki/Q2471869","display_name":"USable","level":2,"score":0.42739999294281006},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.40700000524520874},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.3253999948501587},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3246000111103058},{"id":"https://openalex.org/C2776207758","wikidata":"https://www.wikidata.org/wiki/Q5303302","display_name":"Downstream (manufacturing)","level":2,"score":0.31049999594688416},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2946999967098236},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.29269999265670776},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.28700000047683716},{"id":"https://openalex.org/C87868495","wikidata":"https://www.wikidata.org/wiki/Q750843","display_name":"Information processing","level":2,"score":0.2833999991416931},{"id":"https://openalex.org/C5339829","wikidata":"https://www.wikidata.org/wiki/Q1425977","display_name":"Machine vision","level":2,"score":0.2766999900341034},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.27559998631477356},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2694999873638153},{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.267300009727478},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.26030001044273376},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.25519999861717224}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1007/978-3-032-08324-1_16","is_oa":true,"landing_page_url":"https://doi.org/10.1007/978-3-032-08324-1_16","pdf_url":"https://link.springer.com/content/pdf/10.1007/978-3-032-08324-1_16.pdf","source":{"id":"https://openalex.org/S2764900261","display_name":"Communications in computer and information science","issn_l":"1865-0929","issn":["1865-0929","1865-0937"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"book series"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Communications in Computer and Information Science","raw_type":"book-chapter"}],"best_oa_location":{"id":"doi:10.1007/978-3-032-08324-1_16","is_oa":true,"landing_page_url":"https://doi.org/10.1007/978-3-032-08324-1_16","pdf_url":"https://link.springer.com/content/pdf/10.1007/978-3-032-08324-1_16.pdf","source":{"id":"https://openalex.org/S2764900261","display_name":"Communications in computer and information science","issn_l":"1865-0929","issn":["1865-0929","1865-0937"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"book series"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Communications in Computer and Information Science","raw_type":"book-chapter"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G3514550006","display_name":null,"funder_award_id":"Centre","funder_id":"https://openalex.org/F4320320300","funder_display_name":"European Commission"},{"id":"https://openalex.org/G3984813526","display_name":null,"funder_award_id":"PSG728","funder_id":"https://openalex.org/F4320321090","funder_display_name":"Eesti Teadusagentuur"},{"id":"https://openalex.org/G4272660758","display_name":null,"funder_award_id":"952060","funder_id":"https://openalex.org/F4320321090","funder_display_name":"Eesti Teadusagentuur"},{"id":"https://openalex.org/G8051717526","display_name":null,"funder_award_id":"Grant","funder_id":"https://openalex.org/F4320320300","funder_display_name":"European Commission"},{"id":"https://openalex.org/G8318064016","display_name":null,"funder_award_id":"Horizon","funder_id":"https://openalex.org/F4320320300","funder_display_name":"European Commission"},{"id":"https://openalex.org/G8925289741","display_name":null,"funder_award_id":"952060","funder_id":"https://openalex.org/F4320320300","funder_display_name":"European Commission"}],"funders":[{"id":"https://openalex.org/F4320320300","display_name":"European Commission","ror":"https://ror.org/00k4n6c32"},{"id":"https://openalex.org/F4320321090","display_name":"Eesti Teadusagentuur","ror":"https://ror.org/00jjeja18"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4415209236.pdf","grobid_xml":"https://content.openalex.org/works/W4415209236.grobid-xml"},"referenced_works_count":20,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W1933349210","https://openalex.org/W2277195237","https://openalex.org/W2561715562","https://openalex.org/W2886281300","https://openalex.org/W2945976633","https://openalex.org/W2970862333","https://openalex.org/W3106784008","https://openalex.org/W3138639721","https://openalex.org/W3138819813","https://openalex.org/W3159481202","https://openalex.org/W4253689767","https://openalex.org/W4312791030","https://openalex.org/W4312910992","https://openalex.org/W4313119036","https://openalex.org/W4379347837","https://openalex.org/W4386071535","https://openalex.org/W4387949708","https://openalex.org/W4401751130","https://openalex.org/W4406412134"],"related_works":[],"abstract_inverted_index":{"Abstract":[0],"In":[1],"this":[2,194],"work,":[3],"we":[4,109,226,249,278],"interpret":[5],"the":[6,15,34,132,141,154,181,190,199,232,253],"representations":[7,21,75,95,134,164,215,258],"of":[8,17,24,99,143,183,235,255],"multi-object":[9,151],"scenes":[10,152],"in":[11,49,66,150,180,198,239,259],"vision":[12,173,243,260],"encoders":[13,114,244],"through":[14],"lens":[16],"structured":[18,74,214,236,257],"representations.":[19],"Structured":[20],"allow":[22],"modeling":[23],"individual":[25,148,220],"objects":[26,149,184],"distinctly":[27],"and":[28,40,52,61,69,92,111,125,169,209,241,266,274],"their":[29,187,264,289],"flexible":[30],"use":[31],"based":[32],"on":[33,63,106,116,186],"task":[35,285],"context":[36],"for":[37,204,245,282],"both":[38],"scene-level":[39],"object-specific":[41],"tasks.":[42,247],"These":[43],"capabilities":[44],"play":[45],"a":[46,67],"central":[47],"role":[48],"human":[50,294],"reasoning":[51],"generalization,":[53],"allowing":[54],"us":[55],"to":[56,80,101,146,189,230,251],"abstract":[57],"away":[58],"irrelevant":[59],"details":[60],"focus":[62],"relevant":[64],"information":[65,87,218],"compact":[68],"usable":[70],"form.":[71],"We":[72,130],"define":[73],"as":[76],"those":[77],"that":[78,139,211],"adhere":[79],"two":[81,233],"specific":[82,85,144],"properties:":[83],"binding":[84],"object":[86,94],"into":[88,96,161],"discrete":[89],"representation":[90,182],"units":[91],"segregating":[93],"separate":[97],"sets":[98],"tokens":[100,145,168],"minimize":[102],"cross-object":[103],"entanglement.":[104],"Based":[105],"these":[107,172,271],"properties,":[108],"evaluated":[110],"compared":[112],"image":[113],"pre-trained":[115],"classification":[117],"(ViT),":[118],"large":[119],"vision-language":[120],"models":[121,272],"(CLIP,":[122],"BLIP,":[123],"FLAVA),":[124],"self-supervised":[126],"methods":[127],"(DINO,":[128],"DINOv2).":[129],"examine":[131],"token":[133,201],"by":[135],"creating":[136],"object-decoding":[137],"tasks":[138],"measure":[140],"ability":[142],"capture":[147],"from":[153],"COCO":[155],"dataset.":[156],"This":[157],"analysis":[158],"provides":[159],"insights":[160],"how":[162,270],"object-wise":[163,256],"are":[165],"distributed":[166],"across":[167],"layers":[170,210],"within":[171],"encoders.":[174],"Our":[175],"findings":[176],"highlight":[177],"significant":[178],"differences":[179],"depending":[185],"relevance":[188],"pre-training":[191],"objective,":[192],"with":[193,293],"effect":[195],"particularly":[196],"pronounced":[197],"CLS":[200],"(often":[202],"used":[203],"downstream":[205,246,284],"tasks).":[206],"Meanwhile,":[207],"networks":[208],"exhibit":[212],"more":[213,291],"retain":[216],"better":[217],"about":[219],"objects.":[221],"To":[222],"guide":[223],"practical":[224],"applications,":[225],"propose":[227],"formal":[228],"measures":[229],"quantify":[231],"properties":[234],"representations,":[237],"aiding":[238],"selecting":[240],"adapting":[242],"Overall,":[248],"aim":[250],"advance":[252],"understanding":[254],"encoders,":[261],"thus":[262],"enhancing":[263],"transparency":[265],"interpretability.":[267],"By":[268],"clarifying":[269],"bind":[273],"segregate":[275],"object-level":[276],"information,":[277],"enable":[279],"better-informed":[280],"decisions":[281],"optimal":[283],"adaptation,":[286],"ultimately":[287],"aligning":[288],"behaviour":[290],"closely":[292],"reasoning.":[295]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-16T00:00:00"}
