{"id":"https://openalex.org/W6929420183","doi":"https://doi.org/10.48550/arxiv.2507.23357","title":"Foundations and Models in Modern Computer Vision: Key Building Blocks in Landmark Architectures","display_name":"Foundations and Models in Modern Computer Vision: Key Building Blocks in Landmark Architectures","publication_year":2025,"publication_date":"2025-07-31","ids":{"openalex":"https://openalex.org/W6929420183","doi":"https://doi.org/10.48550/arxiv.2507.23357"},"language":"en","primary_location":{"id":"doi:10.48550/arxiv.2507.23357","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2507.23357","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2507.23357","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Bourceanu, Radu-Andrei","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Bourceanu, Radu-Andrei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"De La Fuente, Neil","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"De La Fuente, Neil","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Grimm, Jan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Grimm, Jan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Jardan, Andrei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jardan, Andrei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Manucharyan, Andriy","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Manucharyan, Andriy","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Weiss, Cornelius","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Weiss, Cornelius","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Cremers, Daniel","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cremers, Daniel","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Pflugfelder, Roman","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pflugfelder, Roman","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.6086999773979187,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.6086999773979187,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.09109999984502792,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.052400000393390656,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/discriminator","display_name":"Discriminator","score":0.6039000153541565},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5509999990463257},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.5115000009536743},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.48660001158714294},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.46790000796318054},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature learning","score":0.4438000023365021},{"id":"https://openalex.org/keywords/convolutional-neural-network","display_name":"Convolutional neural network","score":0.42089998722076416},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.4180000126361847},{"id":"https://openalex.org/keywords/generator","display_name":"Generator (circuit theory)","score":0.415800005197525}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7694000005722046},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6445000171661377},{"id":"https://openalex.org/C2779803651","wikidata":"https://www.wikidata.org/wiki/Q5282088","display_name":"Discriminator","level":3,"score":0.6039000153541565},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5509999990463257},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.5115000009536743},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.48660001158714294},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.46790000796318054},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.4438000023365021},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.42089998722076416},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.420199990272522},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.4180000126361847},{"id":"https://openalex.org/C2780992000","wikidata":"https://www.wikidata.org/wiki/Q17016113","display_name":"Generator (circuit theory)","level":3,"score":0.415800005197525},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.4108999967575073},{"id":"https://openalex.org/C2780297707","wikidata":"https://www.wikidata.org/wiki/Q4895393","display_name":"Landmark","level":2,"score":0.3702000081539154},{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.365200012922287},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.36149999499320984},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.3508000075817108},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.3208000063896179},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.30480000376701355},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.30320000648498535},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3010999858379364},{"id":"https://openalex.org/C155512373","wikidata":"https://www.wikidata.org/wiki/Q287450","display_name":"Residual","level":2,"score":0.2985000014305115},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.290800005197525},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.2833999991416931},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.27459999918937683},{"id":"https://openalex.org/C9417928","wikidata":"https://www.wikidata.org/wiki/Q1070689","display_name":"Image processing","level":3,"score":0.2741999924182892},{"id":"https://openalex.org/C193415008","wikidata":"https://www.wikidata.org/wiki/Q639681","display_name":"Network architecture","level":2,"score":0.27149999141693115},{"id":"https://openalex.org/C184408114","wikidata":"https://www.wikidata.org/wiki/Q1502022","display_name":"Generative Design","level":3,"score":0.2696000039577484},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.2648000121116638},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.25}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2507.23357","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2507.23357","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2507.23357","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2507.23357","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.6568465232849121,"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"This":[0],"report":[1],"analyzes":[2],"the":[3,35,51,62,71,148,179],"evolution":[4],"of":[5,43,67,73,181],"key":[6],"design":[7,203],"patterns":[8],"in":[9,133,171],"computer":[10],"vision":[11,219],"by":[12,60,127],"examining":[13],"six":[14],"influential":[15],"papers.":[16],"The":[17],"analysis":[18],"begins":[19],"with":[20,143,187,194],"foundational":[21],"architectures":[22],"for":[23,76,96,151,216],"image":[24,68,78,152],"recognition.":[25,79],"We":[26,192],"review":[27],"ResNet,":[28],"which":[29,55,102,121,172,198],"introduced":[30],"residual":[31],"connections":[32],"to":[33,65,109,177,204],"overcome":[34],"vanishing":[36],"gradient":[37],"problem":[38],"and":[39,213],"enable":[40],"effective":[41,214],"training":[42,100],"significantly":[44],"deeper":[45],"convolutional":[46],"networks.":[47],"Subsequently,":[48],"we":[49,86,155],"examine":[50],"Vision":[52],"Transformer":[53,63],"(ViT),":[54],"established":[56],"a":[57,104,107,129,134,168,173,182,210],"new":[58],"paradigm":[59],"applying":[61],"architecture":[64],"sequences":[66],"patches,":[69],"demonstrating":[70],"efficacy":[72],"attention-based":[74],"models":[75],"large-scale":[77,218],"Building":[80],"on":[81,163],"these":[82],"visual":[83],"representation":[84],"backbones,":[85],"investigate":[87],"generative":[88,125],"models.":[89,220],"Generative":[90],"Adversarial":[91],"Networks":[92],"(GANs)":[93],"are":[94,119],"analyzed":[95],"their":[97],"novel":[98],"adversarial":[99],"process,":[101],"challenges":[103],"generator":[105],"against":[106],"discriminator":[108],"learn":[110],"complex":[111],"data":[112],"distributions.":[113],"Then,":[114],"Latent":[115],"Diffusion":[116],"Models":[117],"(LDMs)":[118],"covered,":[120],"improve":[122],"upon":[123],"prior":[124],"methods":[126],"performing":[128],"sequential":[130],"denoising":[131],"process":[132],"perceptually":[135],"compressed":[136],"latent":[137],"space.":[138],"LDMs":[139],"achieve":[140],"high-fidelity":[141],"synthesis":[142],"greater":[144],"computational":[145],"efficiency,":[146],"representing":[147],"current":[149],"state-of-the-art":[150],"generation.":[153],"Finally,":[154],"explore":[156],"self-supervised":[157],"learning":[158],"techniques":[159],"that":[160],"reduce":[161],"dependency":[162],"labeled":[164],"data.":[165],"DINO":[166],"is":[167],"self-distillation":[169],"framework":[170],"student":[174],"network":[175],"learns":[176],"match":[178],"output":[180],"momentum-updated":[183],"teacher,":[184],"yielding":[185],"features":[186],"strong":[188],"k-NN":[189],"classification":[190],"performance.":[191],"conclude":[193],"Masked":[195],"Autoencoders":[196],"(MAE),":[197],"utilize":[199],"an":[200],"asymmetric":[201],"encoder-decoder":[202],"reconstruct":[205],"heavily":[206],"masked":[207],"inputs,":[208],"providing":[209],"highly":[211],"scalable":[212],"method":[215],"pre-training":[217]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
