{"id":"https://openalex.org/W4416249463","doi":"https://doi.org/10.1109/ijcnn64981.2025.11228908","title":"DLLaVA: A Novel Multimodal Architecture with Enhanced Vision Encoder and Curriculum Learning","display_name":"DLLaVA: A Novel Multimodal Architecture with Enhanced Vision Encoder and Curriculum Learning","publication_year":2025,"publication_date":"2025-06-30","ids":{"openalex":"https://openalex.org/W4416249463","doi":"https://doi.org/10.1109/ijcnn64981.2025.11228908"},"language":null,"primary_location":{"id":"doi:10.1109/ijcnn64981.2025.11228908","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ijcnn64981.2025.11228908","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Joint Conference on Neural Networks (IJCNN)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101795773","display_name":"Y. Mao","orcid":"https://orcid.org/0000-0003-3184-7733"},"institutions":[{"id":"https://openalex.org/I205237279","display_name":"Nankai University","ror":"https://ror.org/01y1kjr75","country_code":"CN","type":"education","lineage":["https://openalex.org/I205237279"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yuqi Mao","raw_affiliation_strings":["Nankai University,College of Software,Tianjin,China"],"affiliations":[{"raw_affiliation_string":"Nankai University,College of Software,Tianjin,China","institution_ids":["https://openalex.org/I205237279"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5056271944","display_name":"Jianyu Zhou","orcid":"https://orcid.org/0000-0001-5175-9437"},"institutions":[{"id":"https://openalex.org/I205237279","display_name":"Nankai University","ror":"https://ror.org/01y1kjr75","country_code":"CN","type":"education","lineage":["https://openalex.org/I205237279"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianyu Zhou","raw_affiliation_strings":["Nankai University,College of Software,Tianjin,China"],"affiliations":[{"raw_affiliation_string":"Nankai University,College of Software,Tianjin,China","institution_ids":["https://openalex.org/I205237279"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5101795773"],"corresponding_institution_ids":["https://openalex.org/I205237279"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.37184702,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"8"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8439000248908997,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8439000248908997,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.04699999839067459,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.013899999670684338,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.6057999730110168},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5616000294685364},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.521399974822998},{"id":"https://openalex.org/keywords/multimodal-learning","display_name":"Multimodal learning","score":0.4726000130176544},{"id":"https://openalex.org/keywords/architecture","display_name":"Architecture","score":0.4456999897956848},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.4302999973297119},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.40720000863075256},{"id":"https://openalex.org/keywords/curriculum","display_name":"Curriculum","score":0.36970001459121704}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.761900007724762},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.6057999730110168},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5616000294685364},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.548799991607666},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.521399974822998},{"id":"https://openalex.org/C2780660688","wikidata":"https://www.wikidata.org/wiki/Q25052564","display_name":"Multimodal learning","level":2,"score":0.4726000130176544},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4602000117301941},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.4456999897956848},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.4302999973297119},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.40720000863075256},{"id":"https://openalex.org/C47177190","wikidata":"https://www.wikidata.org/wiki/Q207137","display_name":"Curriculum","level":2,"score":0.36970001459121704},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.359499990940094},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.3472000062465668},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.3041999936103821},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3025999963283539},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.3003999888896942},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.27140000462532043},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2635999917984009},{"id":"https://openalex.org/C2780878386","wikidata":"https://www.wikidata.org/wiki/Q1659648","display_name":"Visual language","level":2,"score":0.25619998574256897},{"id":"https://openalex.org/C5339829","wikidata":"https://www.wikidata.org/wiki/Q1425977","display_name":"Machine vision","level":2,"score":0.25060001015663147}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/ijcnn64981.2025.11228908","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ijcnn64981.2025.11228908","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Joint Conference on Neural Networks (IJCNN)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":13,"referenced_works":["https://openalex.org/W3120043490","https://openalex.org/W3142849873","https://openalex.org/W4285255856","https://openalex.org/W4400527399","https://openalex.org/W4401344631","https://openalex.org/W4402713111","https://openalex.org/W4402716477","https://openalex.org/W4402727764","https://openalex.org/W4403081466","https://openalex.org/W4404575065","https://openalex.org/W4405399726","https://openalex.org/W4412887834","https://openalex.org/W4415798312"],"related_works":[],"abstract_inverted_index":{"Multimodal":[0],"large":[1],"models":[2,72],"(MLLMs)":[3],"have":[4],"shown":[5],"strong":[6],"performance":[7],"in":[8,18,131,138,161],"language":[9,48],"tasks,":[10,153],"but":[11],"there":[12],"is":[13],"still":[14],"room":[15],"for":[16,158],"improvement":[17,130,137],"visual":[19,41,89,150],"capabilities.":[20,51],"To":[21,52],"address":[22],"this":[23],"challenge,":[24],"we":[25,58],"introduce":[26],"DLLaVA,":[27],"a":[28,64,85,93,127,135,155],"novel":[29],"model":[30,112],"that":[31,69,97,121],"integrates":[32],"Vision":[33],"Transformer":[34],"(ViT)":[35],"and":[36,49,91,110,134,151],"DINOv2":[37],"architectures":[38],"as":[39],"its":[40],"encoder,":[42],"effectively":[43],"bridging":[44],"the":[45,55,75,79,100,117],"gap":[46],"between":[47],"vision":[50],"further":[53],"optimize":[54],"model\u2019s":[56,80],"performance,":[57],"propose":[59],"two":[60],"key":[61],"innovations:":[62],"1)":[63],"unique":[65],"data":[66],"generation":[67],"pipeline":[68],"utilizes":[70],"closed-source":[71],"to":[73,82,147],"diversify":[74],"pre-training":[76],"dataset,":[77],"enhancing":[78],"ability":[81,146],"generalize":[83],"across":[84],"wide":[86],"range":[87],"of":[88,102],"contexts;":[90],"2)":[92],"curriculum":[94],"learning":[95,109],"strategy":[96],"gradually":[98],"scales":[99],"difficulty":[101],"tasks":[103],"during":[104],"training,":[105],"promoting":[106],"more":[107],"effective":[108],"improved":[111],"robustness.":[113],"Extensive":[114],"evaluation":[115],"on":[116],"MLLM":[118],"benchmark":[119,157],"shows":[120],"DLLaVA":[122],"outperforms":[123],"previous":[124],"methods,":[125],"achieving":[126],"6.56%":[128],"average":[129],"low-resolution":[132],"settings":[133],"2.53%":[136],"high-resolution":[139],"settings.":[140],"These":[141],"results":[142],"highlight":[143],"DLLaVA\u2019s":[144],"enhanced":[145],"handle":[148],"both":[149],"multimodal":[152,162],"setting":[154],"new":[156],"future":[159],"research":[160],"AI.":[163]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-14T00:00:00"}
