{"id":"https://openalex.org/W7164819719","doi":"https://doi.org/10.1145/3805622.3810770","title":"CodeMNER: Vision-Language Models are Better Multimodal Named Entity Recognizers via Progressive Vision-Code Alignment","display_name":"CodeMNER: Vision-Language Models are Better Multimodal Named Entity Recognizers via Progressive Vision-Code Alignment","publication_year":2026,"publication_date":"2026-06-15","ids":{"openalex":"https://openalex.org/W7164819719","doi":"https://doi.org/10.1145/3805622.3810770"},"language":null,"primary_location":{"id":"doi:10.1145/3805622.3810770","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810770","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3805622.3810770","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5123392491","display_name":"Jiakang Yu","orcid":null},"institutions":[{"id":"https://openalex.org/I31590910","display_name":"Jianghan University","ror":"https://ror.org/041c9x778","country_code":"CN","type":"education","lineage":["https://openalex.org/I31590910"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiakang Yu","raw_affiliation_strings":["Jianghan University, Wuhan, China"],"raw_orcid":"https://orcid.org/0009-0005-3183-2576","affiliations":[{"raw_affiliation_string":"Jianghan University, Wuhan, China","institution_ids":["https://openalex.org/I31590910"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087643562","display_name":"Shizhou Huang","orcid":null},"institutions":[{"id":"https://openalex.org/I66867065","display_name":"East China Normal University","ror":"https://ror.org/02n96ep67","country_code":"CN","type":"education","lineage":["https://openalex.org/I66867065"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shizhou Huang","raw_affiliation_strings":["East China Normal University, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0004-2057-5271","affiliations":[{"raw_affiliation_string":"East China Normal University, Shanghai, China","institution_ids":["https://openalex.org/I66867065"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5138655600","display_name":"Xiaode Chen","orcid":"https://orcid.org/0009-0005-3752-6537"},"institutions":[{"id":"https://openalex.org/I31590910","display_name":"Jianghan University","ror":"https://ror.org/041c9x778","country_code":"CN","type":"education","lineage":["https://openalex.org/I31590910"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaode Chen","raw_affiliation_strings":["Jianghan University, Wuhan, China"],"raw_orcid":"https://orcid.org/0009-0005-3752-6537","affiliations":[{"raw_affiliation_string":"Jianghan University, Wuhan, China","institution_ids":["https://openalex.org/I31590910"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5138674737","display_name":"Hongtao Deng","orcid":"https://orcid.org/0000-0001-6910-499X"},"institutions":[{"id":"https://openalex.org/I31590910","display_name":"Jianghan University","ror":"https://ror.org/041c9x778","country_code":"CN","type":"education","lineage":["https://openalex.org/I31590910"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hongtao Deng","raw_affiliation_strings":["Jianghan University, Wuhan, China"],"raw_orcid":"https://orcid.org/0000-0001-6910-499X","affiliations":[{"raw_affiliation_string":"Jianghan University, Wuhan, China","institution_ids":["https://openalex.org/I31590910"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073847459","display_name":"Wang Gao","orcid":"https://orcid.org/0000-0001-9671-489X"},"institutions":[{"id":"https://openalex.org/I31590910","display_name":"Jianghan University","ror":"https://ror.org/041c9x778","country_code":"CN","type":"education","lineage":["https://openalex.org/I31590910"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wang Gao","raw_affiliation_strings":["Jianghan University, Wuhan, China"],"raw_orcid":"https://orcid.org/0000-0001-9671-489X","affiliations":[{"raw_affiliation_string":"Jianghan University, Wuhan, China","institution_ids":["https://openalex.org/I31590910"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5016914537","display_name":"Xun Zhu","orcid":"https://orcid.org/0000-0002-5143-6774"},"institutions":[{"id":"https://openalex.org/I31590910","display_name":"Jianghan University","ror":"https://ror.org/041c9x778","country_code":"CN","type":"education","lineage":["https://openalex.org/I31590910"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xun Zhu","raw_affiliation_strings":["Jianghan University, Wuhan, China"],"raw_orcid":"https://orcid.org/0000-0002-5143-6774","affiliations":[{"raw_affiliation_string":"Jianghan University, Wuhan, China","institution_ids":["https://openalex.org/I31590910"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.93525262,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"497","last_page":"505"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7853999733924866,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7853999733924866,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.16019999980926514,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.014999999664723873,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/executable","display_name":"Executable","score":0.791100025177002},{"id":"https://openalex.org/keywords/structured-prediction","display_name":"Structured prediction","score":0.5819000005722046},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.5788000226020813},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.544700026512146},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.48420000076293945},{"id":"https://openalex.org/keywords/named-entity-recognition","display_name":"Named-entity recognition","score":0.47920000553131104},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.40950000286102295},{"id":"https://openalex.org/keywords/code-generation","display_name":"Code generation","score":0.35530000925064087},{"id":"https://openalex.org/keywords/natural-language-generation","display_name":"Natural language generation","score":0.3330000042915344}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.864799976348877},{"id":"https://openalex.org/C160145156","wikidata":"https://www.wikidata.org/wiki/Q778586","display_name":"Executable","level":2,"score":0.791100025177002},{"id":"https://openalex.org/C22367795","wikidata":"https://www.wikidata.org/wiki/Q7625208","display_name":"Structured prediction","level":2,"score":0.5819000005722046},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.5788000226020813},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.544700026512146},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5318999886512756},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.48420000076293945},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4814000129699707},{"id":"https://openalex.org/C2779135771","wikidata":"https://www.wikidata.org/wiki/Q403574","display_name":"Named-entity recognition","level":3,"score":0.47920000553131104},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.40950000286102295},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.3718999922275543},{"id":"https://openalex.org/C133162039","wikidata":"https://www.wikidata.org/wiki/Q1061077","display_name":"Code generation","level":3,"score":0.35530000925064087},{"id":"https://openalex.org/C2776187449","wikidata":"https://www.wikidata.org/wiki/Q1513879","display_name":"Natural language generation","level":3,"score":0.3330000042915344},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.3206000030040741},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.31769999861717224},{"id":"https://openalex.org/C186644900","wikidata":"https://www.wikidata.org/wiki/Q194152","display_name":"Parsing","level":2,"score":0.304500013589859},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.2969000041484833},{"id":"https://openalex.org/C96711827","wikidata":"https://www.wikidata.org/wiki/Q17012245","display_name":"Entity linking","level":3,"score":0.2919999957084656},{"id":"https://openalex.org/C85847156","wikidata":"https://www.wikidata.org/wiki/Q59015987","display_name":"Verifiable secret sharing","level":3,"score":0.29019999504089355},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.27799999713897705},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.2696000039577484},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2685999870300293},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.2678999900817871},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.26759999990463257},{"id":"https://openalex.org/C114289077","wikidata":"https://www.wikidata.org/wiki/Q3284399","display_name":"Statistical model","level":2,"score":0.2632000148296356},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.258899986743927}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3805622.3810770","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810770","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3805622.3810770","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810770","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.6934961080551147}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W2788647998","https://openalex.org/W2798298921","https://openalex.org/W3035448883","https://openalex.org/W3100180738","https://openalex.org/W3127151332","https://openalex.org/W3176858586","https://openalex.org/W3212769596","https://openalex.org/W4212998232","https://openalex.org/W4221166835","https://openalex.org/W4225757376","https://openalex.org/W4287854428","https://openalex.org/W4304091743","https://openalex.org/W4321488427","https://openalex.org/W4365802806","https://openalex.org/W4385572217","https://openalex.org/W4387848810","https://openalex.org/W4391647588","https://openalex.org/W4393160048","https://openalex.org/W4393160905","https://openalex.org/W4393161236","https://openalex.org/W4402671784","https://openalex.org/W4411635706","https://openalex.org/W4412887996","https://openalex.org/W4412888717","https://openalex.org/W4414360265","https://openalex.org/W4415708228"],"related_works":[],"abstract_inverted_index":{"With":[0],"the":[1,42,92,105,115,119,126,143,176],"explosive":[2],"growth":[3],"of":[4,87,99,118],"multimedia":[5],"content":[6],"on":[7,24,185],"social":[8],"media,":[9],"Multimodal":[10],"Named":[11],"Entity":[12],"Recognition":[13],"(MNER)":[14],"has":[15],"garnered":[16],"significant":[17],"attention.":[18],"However,":[19],"current":[20],"paradigms":[21],"predominantly":[22],"rely":[23],"general":[25],"Vision-Language":[26],"Models":[27],"(VLMs)":[28],"to":[29,39,140,148],"generate":[30],"natural":[31,88,134],"language":[32],"responses.":[33],"Such":[34],"unstructured":[35],"text":[36],"generation":[37,80,121],"struggles":[38],"precisely":[40],"articulate":[41],"complex":[43],"structured":[44,130],"information":[45],"inherent":[46,93,177],"in":[47,52],"MNER":[48,74],"tasks,":[49],"often":[50],"resulting":[51],"outputs":[53],"that":[54,72,196],"lack":[55,125],"logical":[56],"rigor":[57,95],"and":[58,96,110,133,170,181,192],"explicit":[59],"structural":[60],"constraints.":[61],"To":[62,151],"address":[63],"these":[64],"limitations,":[65],"we":[66,154],"propose":[67],"CodeMNER,":[68],"a":[69,77,156],"novel":[70],"framework":[71],"reformulates":[73],"tasks":[75],"as":[76],"multimodal":[78],"code":[79,85,120,131],"problem.":[81],"By":[82],"synthesizing":[83],"executable":[84,149],"instead":[86],"language,":[89],"CodeMNER":[90,197],"leverages":[91],"syntactic":[94],"deterministic":[97],"executability":[98],"programming":[100],"languages,":[101],"thereby":[102],"significantly":[103],"enhancing":[104],"model\u2019s":[106],"capacity":[107],"for":[108],"identifying":[109],"classifying":[111],"named":[112],"entities.":[113],"Despite":[114],"evident":[116],"advantages":[117],"paradigm,":[122],"standard":[123,190],"VLMs":[124],"joint":[127],"alignment":[128,179],"between":[129],"semantics":[132],"visual":[135,146],"representations,":[136],"making":[137],"it":[138],"challenging":[139],"directly":[141],"establish":[142],"mapping":[144],"from":[145],"contexts":[147],"code.":[150],"this":[152],"end,":[153],"design":[155],"progressive":[157],"four-stage":[158],"training":[159],"pipeline,":[160],"encompassing":[161],"mid-training,":[162],"supervised":[163],"fine-tuning,":[164],"reinforcement":[165],"learning":[166],"with":[167],"verifiable":[168],"rewards,":[169],"downstream":[171],"adaptation.":[172],"This":[173],"pipeline":[174],"bridges":[175],"vision-code":[178],"gap":[180],"augments":[182],"model":[183],"performance":[184],"MNER.":[186],"Extensive":[187],"experiments":[188],"across":[189],"Twitter-2015":[191],"Twitter-2017":[193],"datasets":[194],"demonstrate":[195],"achieves":[198],"state-of-the-art":[199],"performance,":[200],"surpassing":[201],"existing":[202],"baselines.":[203]},"counts_by_year":[],"updated_date":"2026-06-16T07:37:23.134862","created_date":"2026-06-16T00:00:00"}
