{"id":"https://openalex.org/W7138314995","doi":"https://doi.org/10.1609/aaai.v40i17.38468","title":"Augmenting Intra-Modal Understanding in MLLMs for Robust Multimodal Keyphrase Generation","display_name":"Augmenting Intra-Modal Understanding in MLLMs for Robust Multimodal Keyphrase Generation","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7138314995","doi":"https://doi.org/10.1609/aaai.v40i17.38468"},"language":null,"primary_location":{"id":"doi:10.1609/aaai.v40i17.38468","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i17.38468","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://doi.org/10.1609/aaai.v40i17.38468","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129721974","display_name":"Jiajun Cao","orcid":null},"institutions":[{"id":"https://openalex.org/I191208505","display_name":"Xiamen University","ror":"https://ror.org/00mcjh785","country_code":"CN","type":"education","lineage":["https://openalex.org/I191208505"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jiajun Cao","raw_affiliation_strings":["Xiamen University"],"affiliations":[{"raw_affiliation_string":"Xiamen University","institution_ids":["https://openalex.org/I191208505"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129704843","display_name":"Qinggang Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I14243506","display_name":"Hong Kong Polytechnic University","ror":"https://ror.org/0030zas98","country_code":"HK","type":"education","lineage":["https://openalex.org/I14243506"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Qinggang Zhang","raw_affiliation_strings":["The Hong Kong Polytechnic University"],"affiliations":[{"raw_affiliation_string":"The Hong Kong Polytechnic University","institution_ids":["https://openalex.org/I14243506"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129698231","display_name":"Yunbo Tang","orcid":null},"institutions":[{"id":"https://openalex.org/I191208505","display_name":"Xiamen University","ror":"https://ror.org/00mcjh785","country_code":"CN","type":"education","lineage":["https://openalex.org/I191208505"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yunbo Tang","raw_affiliation_strings":["Xiamen University"],"affiliations":[{"raw_affiliation_string":"Xiamen University","institution_ids":["https://openalex.org/I191208505"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100667249","display_name":"Zhiyi Xiang","orcid":null},"institutions":[{"id":"https://openalex.org/I191208505","display_name":"Xiamen University","ror":"https://ror.org/00mcjh785","country_code":"CN","type":"education","lineage":["https://openalex.org/I191208505"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhishang Xiang","raw_affiliation_strings":["Xiamen University"],"affiliations":[{"raw_affiliation_string":"Xiamen University","institution_ids":["https://openalex.org/I191208505"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129737190","display_name":"Chang Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I14243506","display_name":"Hong Kong Polytechnic University","ror":"https://ror.org/0030zas98","country_code":"HK","type":"education","lineage":["https://openalex.org/I14243506"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Chang Yang","raw_affiliation_strings":["The Hong Kong Polytechnic University"],"affiliations":[{"raw_affiliation_string":"The Hong Kong Polytechnic University","institution_ids":["https://openalex.org/I14243506"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5129722019","display_name":"Jinsong Su","orcid":null},"institutions":[{"id":"https://openalex.org/I191208505","display_name":"Xiamen University","ror":"https://ror.org/00mcjh785","country_code":"CN","type":"education","lineage":["https://openalex.org/I191208505"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jinsong Su","raw_affiliation_strings":["Xiamen University\nShanghai Artificial Intelligence Laboratory"],"affiliations":[{"raw_affiliation_string":"Xiamen University\nShanghai Artificial Intelligence Laboratory","institution_ids":["https://openalex.org/I191208505"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5129721974"],"corresponding_institution_ids":["https://openalex.org/I191208505"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.52783726,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"17","first_page":"14511","last_page":"14519"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13083","display_name":"Advanced Text Analysis Techniques","score":0.7646999955177307,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13083","display_name":"Advanced Text Analysis Techniques","score":0.7646999955177307,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.148499995470047,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.031099999323487282,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.7264999747276306},{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.5925999879837036},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.5128999948501587},{"id":"https://openalex.org/keywords/structured-prediction","display_name":"Structured prediction","score":0.5001999735832214},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.46720001101493835},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.43470001220703125},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.4092999994754791},{"id":"https://openalex.org/keywords/semantic-feature","display_name":"Semantic feature","score":0.36899998784065247},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.3619000017642975}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8603000044822693},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.7264999747276306},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.5925999879837036},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5834000110626221},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5221999883651733},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.5128999948501587},{"id":"https://openalex.org/C22367795","wikidata":"https://www.wikidata.org/wiki/Q7625208","display_name":"Structured prediction","level":2,"score":0.5001999735832214},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.46720001101493835},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.43470001220703125},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.4092999994754791},{"id":"https://openalex.org/C2781122975","wikidata":"https://www.wikidata.org/wiki/Q16928266","display_name":"Semantic feature","level":2,"score":0.36899998784065247},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.3619000017642975},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.35929998755455017},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.3467999994754791},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.32739999890327454},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3140000104904175},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3052999973297119},{"id":"https://openalex.org/C2777402240","wikidata":"https://www.wikidata.org/wiki/Q6783436","display_name":"Masking (illustration)","level":2,"score":0.3043000102043152},{"id":"https://openalex.org/C2778827112","wikidata":"https://www.wikidata.org/wiki/Q22245680","display_name":"Feature engineering","level":3,"score":0.30140000581741333},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.29409998655319214},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2874000072479248},{"id":"https://openalex.org/C193125573","wikidata":"https://www.wikidata.org/wiki/Q7449065","display_name":"Semantic interpretation","level":2,"score":0.2856000065803528},{"id":"https://openalex.org/C130318100","wikidata":"https://www.wikidata.org/wiki/Q2268914","display_name":"Semantic similarity","level":2,"score":0.2831999957561493},{"id":"https://openalex.org/C2781170535","wikidata":"https://www.wikidata.org/wiki/Q30587856","display_name":"Noisy data","level":2,"score":0.28299999237060547},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.26980000734329224},{"id":"https://openalex.org/C2780660688","wikidata":"https://www.wikidata.org/wiki/Q25052564","display_name":"Multimodal learning","level":2,"score":0.2680000066757202},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2632000148296356},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.2619999945163727},{"id":"https://openalex.org/C2129575","wikidata":"https://www.wikidata.org/wiki/Q54837","display_name":"Semantic Web","level":2,"score":0.25999999046325684},{"id":"https://openalex.org/C86034646","wikidata":"https://www.wikidata.org/wiki/Q474311","display_name":"Semantic gap","level":4,"score":0.257099986076355}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v40i17.38468","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i17.38468","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i17.38468","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i17.38468","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Multimodal":[0],"keyphrase":[1,173],"generation":[2,174],"(MKP)":[3],"aims":[4],"to":[5,83],"extract":[6],"a":[7,84,109],"concise":[8],"set":[9],"of":[10,17,27,86],"keyphrases":[11],"that":[12,68,112,151],"capture":[13],"the":[14,31,43,161],"essential":[15],"meaning":[16],"paired":[18],"image\u2013text":[19],"inputs,":[20],"enabling":[21],"structured":[22],"understanding,":[23],"indexing,":[24],"and":[25,33,75,153,175],"retrieval":[26],"multimedia":[28,92],"data":[29,93],"across":[30,178],"web":[32],"social":[34],"platforms.":[35],"Success":[36],"in":[37,88,118,171],"this":[38,104],"task":[39],"demands":[40],"effectively":[41],"bridging":[42],"semantic":[44,116],"gap":[45],"between":[46],"heterogeneous":[47],"modalities.":[48,101],"While":[49],"multimodal":[50,172],"large":[51],"language":[52],"models":[53],"(MLLMs)":[54],"achieve":[55],"superior":[56],"cross-modal":[57,122,164],"understanding":[58],"by":[59,141],"leveraging":[60],"massive":[61],"pretraining":[62],"on":[63],"image-text":[64],"corpora,":[65],"we":[66,106],"observe":[67],"they":[69],"often":[70],"struggle":[71],"with":[72,97],"modality":[73,144],"bias":[74],"fine-grained":[76,135],"intra-modal":[77,115],"feature":[78,136],"extraction.":[79],"This":[80],"oversight":[81],"leads":[82],"lack":[85],"robustness":[87,177],"real-world":[89],"scenarios":[90],"where":[91],"is":[94],"noisy,":[95],"along":[96],"incomplete":[98],"or":[99],"misaligned":[100],"To":[102],"address":[103],"problem,":[105],"propose":[107],"AimKP,":[108],"novel":[110],"framework":[111],"explicitly":[113],"reinforces":[114],"learning":[117],"MLLMs":[119],"while":[120],"preserving":[121],"alignment.":[123],"AimKP":[124],"incorporates":[125],"two":[126],"core":[127,163],"innovations:":[128],"(i)":[129],"Progressive":[130],"Modality":[131],"Masking,":[132],"which":[133],"forces":[134],"extraction":[137],"from":[138,159],"corrupted":[139],"inputs":[140],"progressively":[142],"masking":[143],"information":[145],"during":[146],"training;":[147],"(ii)":[148],"Gradient-based":[149],"Filtering,":[150],"identifies":[152],"discards":[154],"noisy":[155],"samples,":[156],"preventing":[157],"them":[158],"corrupting":[160],"model\u2019s":[162],"learning.":[165],"Extensive":[166],"experiments":[167],"validate":[168],"AimKP\u2019s":[169],"effectiveness":[170],"its":[176],"different":[179],"scenarios.":[180]},"counts_by_year":[],"updated_date":"2026-03-20T20:47:17.329874","created_date":"2026-03-18T00:00:00"}
