{"id":"https://openalex.org/W4399435609","doi":"https://doi.org/10.1145/3652583.3658005","title":"Pyramidal Cross-Modal Transformer with Sustained Visual Guidance for Multi-Label Image Classification","display_name":"Pyramidal Cross-Modal Transformer with Sustained Visual Guidance for Multi-Label Image Classification","publication_year":2024,"publication_date":"2024-05-30","ids":{"openalex":"https://openalex.org/W4399435609","doi":"https://doi.org/10.1145/3652583.3658005"},"language":"en","primary_location":{"id":"doi:10.1145/3652583.3658005","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3652583.3658005","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3652583.3658005","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2024 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3652583.3658005","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5017388107","display_name":"Zhuohua Li","orcid":"https://orcid.org/0000-0002-4424-8249"},"institutions":[{"id":"https://openalex.org/I4210156404","display_name":"Institute of Information Engineering","ror":"https://ror.org/04r53se39","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210156404"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhuohua Li","raw_affiliation_strings":["Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-4424-8249","affiliations":[{"raw_affiliation_string":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210156404","https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009747205","display_name":"Ruyun Wang","orcid":"https://orcid.org/0000-0002-7113-3017"},"institutions":[{"id":"https://openalex.org/I4210156404","display_name":"Institute of Information Engineering","ror":"https://ror.org/04r53se39","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210156404"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ruyun Wang","raw_affiliation_strings":["Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-7113-3017","affiliations":[{"raw_affiliation_string":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210156404","https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029956410","display_name":"Fuqing Zhu","orcid":"https://orcid.org/0000-0001-7061-3329"},"institutions":[{"id":"https://openalex.org/I4210156404","display_name":"Institute of Information Engineering","ror":"https://ror.org/04r53se39","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210156404"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Fuqing Zhu","raw_affiliation_strings":["Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-7061-3329","affiliations":[{"raw_affiliation_string":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210156404","https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082300353","display_name":"Jizhong Han","orcid":"https://orcid.org/0000-0003-1107-3873"},"institutions":[{"id":"https://openalex.org/I4210156404","display_name":"Institute of Information Engineering","ror":"https://ror.org/04r53se39","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210156404"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jizhong Han","raw_affiliation_strings":["Institute of Information Engineering, Chinese Academy of Science, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0003-1107-3873","affiliations":[{"raw_affiliation_string":"Institute of Information Engineering, Chinese Academy of Science, Beijing, China","institution_ids":["https://openalex.org/I4210156404"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5018288181","display_name":"Songlin Hu","orcid":"https://orcid.org/0000-0002-7170-3809"},"institutions":[{"id":"https://openalex.org/I4210156404","display_name":"Institute of Information Engineering","ror":"https://ror.org/04r53se39","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210156404"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Songlin Hu","raw_affiliation_strings":["Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-7170-3809","affiliations":[{"raw_affiliation_string":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210156404","https://openalex.org/I4210165038"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5017388107"],"corresponding_institution_ids":["https://openalex.org/I4210156404","https://openalex.org/I4210165038"],"apc_list":null,"apc_paid":null,"fwci":0.9523,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.75251465,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":97,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"740","last_page":"748"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.989799976348877,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11550","display_name":"Text and Document Classification Technologies","score":0.9865999817848206,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7667685747146606},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6258236169815063},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.529839038848877},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.5295996069908142},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.5246548652648926},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.5025928020477295},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.4748407006263733},{"id":"https://openalex.org/keywords/rendering","display_name":"Rendering (computer graphics)","score":0.4167846143245697},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.3226652145385742}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7667685747146606},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6258236169815063},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.529839038848877},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.5295996069908142},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.5246548652648926},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.5025928020477295},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.4748407006263733},{"id":"https://openalex.org/C205711294","wikidata":"https://www.wikidata.org/wiki/Q176953","display_name":"Rendering (computer graphics)","level":2,"score":0.4167846143245697},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3226652145385742},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3652583.3658005","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3652583.3658005","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3652583.3658005","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2024 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3652583.3658005","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3652583.3658005","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3652583.3658005","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2024 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4399435609.pdf"},"referenced_works_count":32,"referenced_works":["https://openalex.org/W1536680647","https://openalex.org/W1567302070","https://openalex.org/W1861492603","https://openalex.org/W2194775991","https://openalex.org/W2560096627","https://openalex.org/W2618530766","https://openalex.org/W2932399282","https://openalex.org/W2963052338","https://openalex.org/W2963300078","https://openalex.org/W2963745697","https://openalex.org/W2963875806","https://openalex.org/W2982112268","https://openalex.org/W2998420437","https://openalex.org/W2999565440","https://openalex.org/W3035665101","https://openalex.org/W3099518117","https://openalex.org/W3109067582","https://openalex.org/W3138516171","https://openalex.org/W3167456680","https://openalex.org/W3171625335","https://openalex.org/W3174320978","https://openalex.org/W3174396085","https://openalex.org/W3205618423","https://openalex.org/W4214673031","https://openalex.org/W4221079634","https://openalex.org/W4304014049","https://openalex.org/W4304080876","https://openalex.org/W4312847199","https://openalex.org/W4367847529","https://openalex.org/W4379791025","https://openalex.org/W4379806189","https://openalex.org/W4390874143"],"related_works":["https://openalex.org/W2068608913","https://openalex.org/W3124914020","https://openalex.org/W4390516098","https://openalex.org/W2141033859","https://openalex.org/W2156434174","https://openalex.org/W2071701083","https://openalex.org/W2383687187","https://openalex.org/W2181948922","https://openalex.org/W2081517010","https://openalex.org/W2121496884"],"abstract_inverted_index":{"Multi-label":[0],"image":[1],"classification":[2],"poses":[3],"a":[4,92,111],"formidable":[5],"challenge":[6],"due":[7],"to":[8,21,42,66,120,143,177],"the":[9,23,32,69,76,101,107,116,129,136,146,174,179],"presence":[10],"of":[11,34,79,132,165,181],"multiple":[12,29],"objects":[13,30,62],"in":[14],"each":[15],"image,":[16],"rendering":[17],"it":[18,52],"notably":[19],"complex":[20],"decipher":[22],"visual":[24,36,86,103,108,150,183],"content":[25],"comprehensively.":[26],"Discriminating":[27],"between":[28,149],"necessitates":[31],"establishment":[33],"robust":[35],"label":[37,80,87,125,152,184],"dependencies.":[38,81,185],"Previous":[39],"methods":[40],"attempt":[41],"formulate":[43],"cross-modal":[44],"interaction":[45,139],"or":[46,60],"one-shot":[47],"co-occurrence":[48,133],"relationship":[49],"guidance.":[50],"However,":[51],"not":[53],"only":[54],"exhibits":[55],"limitations":[56],"when":[57],"handling":[58],"occluded":[59],"blurry":[61],"but":[63],"also":[64],"fails":[65],"fully":[67],"leverage":[68],"diverse":[70],"hierarchical":[71,85],"properties":[72],"for":[73,97,124],"sustainably":[74,83],"guiding":[75],"learning":[77],"process":[78],"To":[82],"establish":[84],"dependencies,":[88],"this":[89],"paper":[90],"introduces":[91],"Pyramidal":[93],"Cross-modal":[94],"Transformer":[95],"framework":[96],"MLIC":[98],"tasks.":[99],"Specifically,":[100],"pyramidal":[102],"guidance":[104,123],"layer":[105,140],"parses":[106],"features":[109],"into":[110],"multi-resolution":[112],"pyramid":[113],"structure,":[114],"allowing":[115],"updated":[117],"visual-related":[118],"information":[119,153],"provide":[121,203],"sustained":[122],"semantics.":[126],"This":[127],"surpasses":[128],"conventional":[130],"pre-processing":[131],"relationships.":[134],"Besides,":[135],"hybrid":[137],"modal":[138],"is":[141],"proposed":[142],"effectively":[144],"mitigate":[145],"semantic":[147],"disparities":[148],"and":[151,171,194],"with":[154],"modal-blended":[155],"indiscriminate":[156],"attention,":[157],"replacing":[158],"vanilla":[159],"self-attention.":[160],"Several":[161],"combination":[162],"blocks":[163],"consisting":[164],"these":[166],"two":[167,189],"layers":[168],"are":[169],"integrated":[170],"embedded":[172],"within":[173],"encoder-decoder":[175],"structure":[176],"facilitate":[178],"exploration":[180],"meticulous":[182],"Extensive":[186],"experiments":[187],"on":[188],"widely-used":[190],"benchmarks,":[191],"including":[192],"MS-COCO":[193],"PASCAL":[195],"VOC":[196],"2007,":[197],"consistently":[198],"demonstrate":[199],"that":[200],"PCMT":[201],"could":[202],"state-of-the-art":[204],"results.":[205]},"counts_by_year":[{"year":2025,"cited_by_count":4}],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
