{"id":"https://openalex.org/W4304086156","doi":"https://doi.org/10.1145/3503161.3548150","title":"Image Understanding by Captioning with Differentiable Architecture Search","display_name":"Image Understanding by Captioning with Differentiable Architecture Search","publication_year":2022,"publication_date":"2022-10-10","ids":{"openalex":"https://openalex.org/W4304086156","doi":"https://doi.org/10.1145/3503161.3548150"},"language":"en","primary_location":{"id":"doi:10.1145/3503161.3548150","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3503161.3548150","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3503161.3548150","source":{"id":"https://openalex.org/S4363608757","display_name":"Proceedings of the 30th ACM International Conference on Multimedia","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3503161.3548150","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5077761682","display_name":"Ramtin Hosseini","orcid":"https://orcid.org/0000-0001-6618-1279"},"institutions":[{"id":"https://openalex.org/I36258959","display_name":"University of California, San Diego","ror":"https://ror.org/0168r3w48","country_code":"US","type":"education","lineage":["https://openalex.org/I36258959"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Ramtin Hosseini","raw_affiliation_strings":["University of California, San Diego, San Diego, CA, USA"],"affiliations":[{"raw_affiliation_string":"University of California, San Diego, San Diego, CA, USA","institution_ids":["https://openalex.org/I36258959"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5083884675","display_name":"Pengtao Xie","orcid":"https://orcid.org/0000-0003-0521-174X"},"institutions":[{"id":"https://openalex.org/I36258959","display_name":"University of California, San Diego","ror":"https://ror.org/0168r3w48","country_code":"US","type":"education","lineage":["https://openalex.org/I36258959"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Pengtao Xie","raw_affiliation_strings":["University of California, San Diego, San Diego, CA, USA"],"affiliations":[{"raw_affiliation_string":"University of California, San Diego, San Diego, CA, USA","institution_ids":["https://openalex.org/I36258959"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5077761682"],"corresponding_institution_ids":["https://openalex.org/I36258959"],"apc_list":null,"apc_paid":null,"fwci":0.1199,"has_fulltext":true,"cited_by_count":2,"citation_normalized_percentile":{"value":0.41665478,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":94},"biblio":{"volume":null,"issue":null,"first_page":"4665","last_page":"4673"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9941999912261963,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9911999702453613,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.9829704165458679},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8690014481544495},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.7172473073005676},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.627018928527832},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.5609888434410095},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.48956379294395447},{"id":"https://openalex.org/keywords/architecture","display_name":"Architecture","score":0.4769909977912903},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.413998544216156},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.3711014688014984},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3483318090438843}],"concepts":[{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.9829704165458679},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8690014481544495},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.7172473073005676},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.627018928527832},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.5609888434410095},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.48956379294395447},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.4769909977912903},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.413998544216156},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3711014688014984},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3483318090438843},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C142362112","wikidata":"https://www.wikidata.org/wiki/Q735","display_name":"Art","level":0,"score":0.0},{"id":"https://openalex.org/C153349607","wikidata":"https://www.wikidata.org/wiki/Q36649","display_name":"Visual arts","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3503161.3548150","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3503161.3548150","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3503161.3548150","source":{"id":"https://openalex.org/S4363608757","display_name":"Proceedings of the 30th ACM International Conference on Multimedia","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3503161.3548150","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3503161.3548150","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3503161.3548150","source":{"id":"https://openalex.org/S4363608757","display_name":"Proceedings of the 30th ACM International Conference on Multimedia","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Multimedia","raw_type":"proceedings-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/11","display_name":"Sustainable cities and communities","score":0.5199999809265137}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4304086156.pdf","grobid_xml":"https://content.openalex.org/works/W4304086156.grobid-xml"},"referenced_works_count":30,"referenced_works":["https://openalex.org/W1593271688","https://openalex.org/W1861492603","https://openalex.org/W1895577753","https://openalex.org/W1905882502","https://openalex.org/W1923211482","https://openalex.org/W1956340063","https://openalex.org/W2108598243","https://openalex.org/W2277195237","https://openalex.org/W2302086703","https://openalex.org/W2506483933","https://openalex.org/W2552161745","https://openalex.org/W2745461083","https://openalex.org/W2885013662","https://openalex.org/W2890531016","https://openalex.org/W2963084599","https://openalex.org/W2963101956","https://openalex.org/W2964081807","https://openalex.org/W2965658867","https://openalex.org/W2965697393","https://openalex.org/W2971302065","https://openalex.org/W2986670728","https://openalex.org/W3034655362","https://openalex.org/W3035284526","https://openalex.org/W3091588028","https://openalex.org/W3099884890","https://openalex.org/W3127216151","https://openalex.org/W3172317602","https://openalex.org/W3173220247","https://openalex.org/W3174346308","https://openalex.org/W4210778442"],"related_works":["https://openalex.org/W4210416330","https://openalex.org/W3164229987","https://openalex.org/W3215212336","https://openalex.org/W4290852288","https://openalex.org/W3122720459","https://openalex.org/W4298897568","https://openalex.org/W3217388757","https://openalex.org/W1938708284","https://openalex.org/W4380190185","https://openalex.org/W3161816943"],"abstract_inverted_index":{"In":[0,39,133],"deep":[1,32],"learning":[2],"applications,":[3],"image":[4,15,40,68,120,138,153,165,211,230],"understanding":[5,231],"is":[6,73],"a":[7,66,74,103,163],"crucial":[8],"task,":[9],"where":[10,46],"several":[11],"techniques":[12],"such":[13],"as":[14],"captioning":[16,69,121,139,166],"and":[17,27,55,90,142,149,171,195,206,224],"visual":[18],"question":[19],"answering":[20],"have":[21,43],"been":[22],"widely":[23],"studied":[24],"to":[25,61,78,93,113,151,180],"improve":[26],"evaluate":[28],"the":[29,47,50,59,79,83,87,115,134,144,156,159,172,177,185,192,197,202,209,222],"performances":[30],"of":[31,81,86,146],"neural":[33],"networks":[34],"(DNN)":[35],"in":[36,229],"this":[37,99],"area.":[38],"captioning,":[41],"models":[42],"encoder-decoder":[44,70,161,198],"architectures,":[45],"encoders":[48],"take":[49],"input":[51,88],"images,":[52,170],"produce":[53],"embeddings,":[54],"feed":[56],"them":[57],"into":[58],"decoders":[60],"generate":[62,94],"textual":[63],"descriptions.":[64,96],"Designing":[65],"proper":[67],"architecture":[71,110,118,199],"manually":[72],"difficult":[75],"challenge":[76],"due":[77],"complexity":[80],"recognizing":[82],"critical":[84],"objects":[85],"images":[89],"their":[91],"relationships":[92],"caption":[95],"To":[97],"address":[98],"issue,":[100],"we":[101],"propose":[102],"three-level":[104],"optimization":[105,124],"method":[106,217],"that":[107,215],"employs":[108],"differentiable":[109],"search":[111],"strategies":[112],"seek":[114],"most":[116],"suitable":[117],"for":[119],"automatically.":[122],"Our":[123],"framework":[125],"involves":[126],"three":[127],"stages,":[128],"which":[129],"are":[130],"performed":[131],"end-to-end.":[132],"first":[135],"stage,":[136,158],"an":[137],"model":[140,174,187],"learns":[141],"updates":[143,196],"weights":[145],"its":[147,182,189],"encoder":[148],"decoder":[150],"create":[152],"captions.":[154],"At":[155],"next":[157],"trained":[160,186],"generates":[162],"pseudo":[164],"dataset":[167,179],"from":[168],"unlabeled":[169],"predictive":[173],"trains":[175],"on":[176,191,208],"generated":[178],"update":[181],"weights.":[183],"Finally,":[184],"validates":[188],"performance":[190],"validation":[193,203],"set":[194],"by":[200],"minimizing":[201],"loss.":[204],"Experiments":[205],"studies":[207],"COCO":[210],"captions":[212],"datasets":[213],"demonstrate":[214],"our":[216],"performs":[218],"significantly":[219],"better":[220],"than":[221],"baselines":[223],"can":[225],"achieve":[226],"state-of-the-art":[227],"results":[228],"tasks.":[232]},"counts_by_year":[{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
