{"id":"https://openalex.org/W3159619744","doi":"https://doi.org/10.1109/iccv48922.2021.00180","title":"MDETR - Modulated Detection for End-to-End Multi-Modal Understanding","display_name":"MDETR - Modulated Detection for End-to-End Multi-Modal Understanding","publication_year":2021,"publication_date":"2021-10-01","ids":{"openalex":"https://openalex.org/W3159619744","doi":"https://doi.org/10.1109/iccv48922.2021.00180","mag":"3159619744"},"language":"en","primary_location":{"id":"doi:10.1109/iccv48922.2021.00180","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccv48922.2021.00180","pdf_url":null,"source":{"id":"https://openalex.org/S4363607764","display_name":"2021 IEEE/CVF International Conference on Computer Vision (ICCV)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE/CVF International Conference on Computer Vision (ICCV)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5077791956","display_name":"Aishwarya Kamath","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Aishwarya Kamath","raw_affiliation_strings":["NYU Center for Data Science"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NYU Center for Data Science","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5060023106","display_name":"Mannat Singh","orcid":null},"institutions":[{"id":"https://openalex.org/I2252078561","display_name":"Meta (Israel)","ror":"https://ror.org/02388em19","country_code":"IL","type":"company","lineage":["https://openalex.org/I2252078561","https://openalex.org/I4210114444"]}],"countries":["IL"],"is_corresponding":false,"raw_author_name":"Mannat Singh","raw_affiliation_strings":["Facebook AI Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Facebook AI Research","institution_ids":["https://openalex.org/I2252078561"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001226970","display_name":"Yann LeCun","orcid":null},"institutions":[{"id":"https://openalex.org/I2252078561","display_name":"Meta (Israel)","ror":"https://ror.org/02388em19","country_code":"IL","type":"company","lineage":["https://openalex.org/I2252078561","https://openalex.org/I4210114444"]},{"id":"https://openalex.org/I36672615","display_name":"Courant Institute of Mathematical Sciences","ror":"https://ror.org/037tm7f56","country_code":"US","type":"education","lineage":["https://openalex.org/I36672615","https://openalex.org/I57206974"]}],"countries":["IL","US"],"is_corresponding":false,"raw_author_name":"Yann LeCun","raw_affiliation_strings":["NYU Center for Data Science","NYU Courant Institute","Facebook AI Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NYU Center for Data Science","institution_ids":[]},{"raw_affiliation_string":"NYU Courant Institute","institution_ids":["https://openalex.org/I36672615"]},{"raw_affiliation_string":"Facebook AI Research","institution_ids":["https://openalex.org/I2252078561"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041907084","display_name":"Gabriel Synnaeve","orcid":"https://orcid.org/0000-0003-1715-3356"},"institutions":[{"id":"https://openalex.org/I2252078561","display_name":"Meta (Israel)","ror":"https://ror.org/02388em19","country_code":"IL","type":"company","lineage":["https://openalex.org/I2252078561","https://openalex.org/I4210114444"]}],"countries":["IL"],"is_corresponding":false,"raw_author_name":"Gabriel Synnaeve","raw_affiliation_strings":["Facebook AI Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Facebook AI Research","institution_ids":["https://openalex.org/I2252078561"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000623592","display_name":"Ishan Misra","orcid":"https://orcid.org/0000-0001-7708-7261"},"institutions":[{"id":"https://openalex.org/I2252078561","display_name":"Meta (Israel)","ror":"https://ror.org/02388em19","country_code":"IL","type":"company","lineage":["https://openalex.org/I2252078561","https://openalex.org/I4210114444"]}],"countries":["IL"],"is_corresponding":false,"raw_author_name":"Ishan Misra","raw_affiliation_strings":["Facebook AI Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Facebook AI Research","institution_ids":["https://openalex.org/I2252078561"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5029981206","display_name":"Nicolas Carion","orcid":"https://orcid.org/0000-0002-2308-9680"},"institutions":[{"id":"https://openalex.org/I36672615","display_name":"Courant Institute of Mathematical Sciences","ror":"https://ror.org/037tm7f56","country_code":"US","type":"education","lineage":["https://openalex.org/I36672615","https://openalex.org/I57206974"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Nicolas Carion","raw_affiliation_strings":["NYU Courant Institute"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NYU Courant Institute","institution_ids":["https://openalex.org/I36672615"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5077791956"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":31.1174,"has_fulltext":false,"cited_by_count":660,"citation_normalized_percentile":{"value":0.99799427,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"1760","last_page":"1770"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9912999868392944,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.989300012588501,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8206334710121155},{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.637283205986023},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5989469289779663},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5941208004951477},{"id":"https://openalex.org/keywords/detector","display_name":"Detector","score":0.5316624641418457},{"id":"https://openalex.org/keywords/object-detection","display_name":"Object detection","score":0.5041342973709106},{"id":"https://openalex.org/keywords/vocabulary","display_name":"Vocabulary","score":0.5009992122650146},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.5008478164672852},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.4799497425556183},{"id":"https://openalex.org/keywords/phrase","display_name":"Phrase","score":0.4107394516468048},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4104197919368744},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.3994423449039459}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8206334710121155},{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.637283205986023},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5989469289779663},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5941208004951477},{"id":"https://openalex.org/C94915269","wikidata":"https://www.wikidata.org/wiki/Q1834857","display_name":"Detector","level":2,"score":0.5316624641418457},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.5041342973709106},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.5009992122650146},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.5008478164672852},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.4799497425556183},{"id":"https://openalex.org/C2776224158","wikidata":"https://www.wikidata.org/wiki/Q187931","display_name":"Phrase","level":2,"score":0.4107394516468048},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4104197919368744},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3994423449039459},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/iccv48922.2021.00180","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccv48922.2021.00180","pdf_url":null,"source":{"id":"https://openalex.org/S4363607764","display_name":"2021 IEEE/CVF International Conference on Computer Vision (ICCV)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE/CVF International Conference on Computer Vision (ICCV)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.7599999904632568,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":112,"referenced_works":["https://openalex.org/W1773149199","https://openalex.org/W1861492603","https://openalex.org/W2109586012","https://openalex.org/W2194775991","https://openalex.org/W2277195237","https://openalex.org/W2489434015","https://openalex.org/W2560730294","https://openalex.org/W2561715562","https://openalex.org/W2745461083","https://openalex.org/W2760103357","https://openalex.org/W2793546384","https://openalex.org/W2798556392","https://openalex.org/W2798991696","https://openalex.org/W2842511635","https://openalex.org/W2886641317","https://openalex.org/W2891021031","https://openalex.org/W2891999386","https://openalex.org/W2907143950","https://openalex.org/W2908510526","https://openalex.org/W2948672349","https://openalex.org/W2955425717","https://openalex.org/W2962716332","https://openalex.org/W2962766617","https://openalex.org/W2962784628","https://openalex.org/W2962811161","https://openalex.org/W2962914239","https://openalex.org/W2963109634","https://openalex.org/W2963223524","https://openalex.org/W2963224792","https://openalex.org/W2963351448","https://openalex.org/W2963518342","https://openalex.org/W2963521239","https://openalex.org/W2963738360","https://openalex.org/W2963783181","https://openalex.org/W2963907629","https://openalex.org/W2964022527","https://openalex.org/W2964121744","https://openalex.org/W2964345792","https://openalex.org/W2965373594","https://openalex.org/W2966715458","https://openalex.org/W2968124245","https://openalex.org/W2969876226","https://openalex.org/W2970231061","https://openalex.org/W2970608575","https://openalex.org/W2975501350","https://openalex.org/W2978017171","https://openalex.org/W2979826702","https://openalex.org/W2987734933","https://openalex.org/W2990397898","https://openalex.org/W2995460200","https://openalex.org/W3004019157","https://openalex.org/W3018442911","https://openalex.org/W3034727271","https://openalex.org/W3034764937","https://openalex.org/W3034971973","https://openalex.org/W3035160371","https://openalex.org/W3035552357","https://openalex.org/W3035688398","https://openalex.org/W3038476992","https://openalex.org/W3091588028","https://openalex.org/W3092198590","https://openalex.org/W3096609285","https://openalex.org/W3098824823","https://openalex.org/W3099849198","https://openalex.org/W3109644378","https://openalex.org/W3116853161","https://openalex.org/W3116952214","https://openalex.org/W3118500473","https://openalex.org/W3120237956","https://openalex.org/W3127023342","https://openalex.org/W3135367836","https://openalex.org/W3143320354","https://openalex.org/W3166396011","https://openalex.org/W3167118264","https://openalex.org/W4230419477","https://openalex.org/W4287353120","https://openalex.org/W4288286281","https://openalex.org/W4297749157","https://openalex.org/W4297808394","https://openalex.org/W4385245566","https://openalex.org/W6638575559","https://openalex.org/W6639102338","https://openalex.org/W6676497082","https://openalex.org/W6694395031","https://openalex.org/W6738893770","https://openalex.org/W6739901393","https://openalex.org/W6740674931","https://openalex.org/W6746089280","https://openalex.org/W6748270630","https://openalex.org/W6752083267","https://openalex.org/W6754944153","https://openalex.org/W6755014234","https://openalex.org/W6762718338","https://openalex.org/W6765591853","https://openalex.org/W6766673545","https://openalex.org/W6766904570","https://openalex.org/W6767211374","https://openalex.org/W6767279747","https://openalex.org/W6768438993","https://openalex.org/W6775970589","https://openalex.org/W6776765564","https://openalex.org/W6778485988","https://openalex.org/W6779473860","https://openalex.org/W6780137884","https://openalex.org/W6786862242","https://openalex.org/W6787671520","https://openalex.org/W6788436139","https://openalex.org/W6788554130","https://openalex.org/W6789909235","https://openalex.org/W6789992408","https://openalex.org/W6791353385","https://openalex.org/W6844194202"],"related_works":["https://openalex.org/W2151749779","https://openalex.org/W2039546652","https://openalex.org/W3179968364","https://openalex.org/W2012262991","https://openalex.org/W1999612375","https://openalex.org/W2373794620","https://openalex.org/W2060629350","https://openalex.org/W2938107654","https://openalex.org/W2357294589","https://openalex.org/W3173456895"],"abstract_inverted_index":{"Multi-modal":[0],"reasoning":[1],"systems":[2,49],"rely":[3],"on":[4,35,80,119,143,159,174,221],"a":[5,25,36,81,86,89,93,175,182,192],"pre-trained":[6],"object":[7,172,200],"detector":[8,72,173],"to":[9,50,96,194],"extract":[10],"regions":[11],"of":[12,30,39,55,112,167,199],"interest":[13],"from":[14,124],"the":[15,31,52,105,113,117,138,165,196],"image.":[16,139],"However,":[17],"this":[18,64],"crucial":[19],"module":[20],"is":[21],"typically":[22],"used":[23],"as":[24,148,170],"black":[26],"box,":[27],"trained":[28],"independently":[29],"downstream":[32,145],"task":[33],"and":[34,41,101,135,154,223,227],"fixed":[37],"vocabulary":[38],"objects":[40,75,136],"attributes.":[42],"This":[43],"makes":[44],"it":[45],"challenging":[46],"for":[47,214],"such":[48,147],"capture":[51],"long":[53,197],"tail":[54,198],"visual":[56,215],"concepts":[57],"expressed":[58],"in":[59,76,133,137,181],"free":[60],"form":[61],"text.":[62],"In":[63],"paper":[65],"we":[66],"propose":[67],"MDETR,":[68],"an":[69,77,109,171],"end-to-end":[70],"modulated":[71],"that":[73,187],"detects":[74],"image":[78,102],"conditioned":[79],"raw":[82],"text":[83,100,134],"query,":[84],"like":[85],"caption":[87],"or":[88],"question.":[90],"We":[91,115,140,162,185],"use":[92],"transformer-based":[94],"architecture":[95],"reason":[97],"jointly":[98],"over":[99],"by":[103],"fusing":[104],"two":[106],"modalities":[107],"at":[108,231],"early":[110],"stage":[111],"model.":[114],"pre-train":[116],"network":[118],"1.3M":[120],"text-image":[121],"pairs,":[122],"mined":[123],"pre-existing":[125],"multi-modal":[126],"datasets":[127],"having":[128],"explicit":[129],"alignment":[130],"between":[131],"phrases":[132],"then":[141],"fine-tune":[142],"several":[144],"tasks":[146],"phrase":[149],"grounding,":[150],"referring":[151],"expression":[152],"comprehension":[153],"segmentation,":[155],"achieving":[156,218],"state-of-the-art":[157],"results":[158],"popular":[160],"benchmarks.":[161],"also":[163],"investigate":[164],"utility":[166],"our":[168,188],"model":[169],"given":[176],"label":[177],"set":[178],"when":[179],"fine-tuned":[180],"few-shot":[183],"setting.":[184],"show":[186],"pre-training":[189],"approach":[190,209],"provides":[191],"way":[193],"handle":[195],"categories":[201],"which":[202],"have":[203],"very":[204],"few":[205],"labelled":[206],"instances.":[207],"Our":[208],"can":[210],"be":[211],"easily":[212],"extended":[213],"question":[216],"answering,":[217],"competitive":[219],"performance":[220],"GQA":[222],"CLEVR.":[224],"The":[225],"code":[226],"models":[228],"are":[229],"available":[230],"https://github.com/ashkamath/mdetr.":[232]},"counts_by_year":[{"year":2026,"cited_by_count":30},{"year":2025,"cited_by_count":157},{"year":2024,"cited_by_count":180},{"year":2023,"cited_by_count":194},{"year":2022,"cited_by_count":84},{"year":2021,"cited_by_count":15}],"updated_date":"2026-05-10T08:33:47.465468","created_date":"2025-10-10T00:00:00"}
