{"id":"https://openalex.org/W3196916587","doi":"https://doi.org/10.1145/3460426.3463610","title":"Learning Hierarchical Visual-Semantic Representation with Phrase Alignment","display_name":"Learning Hierarchical Visual-Semantic Representation with Phrase Alignment","publication_year":2021,"publication_date":"2021-08-24","ids":{"openalex":"https://openalex.org/W3196916587","doi":"https://doi.org/10.1145/3460426.3463610","mag":"3196916587"},"language":"en","primary_location":{"id":"doi:10.1145/3460426.3463610","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3460426.3463610","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2021 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5005499406","display_name":"Baoming Yan","orcid":null},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Baoming Yan","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091406620","display_name":"Qingheng Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qingheng Zhang","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100698216","display_name":"Liyu Chen","orcid":"https://orcid.org/0000-0002-0589-6032"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Liyu Chen","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100403197","display_name":"Lin Wang","orcid":"https://orcid.org/0000-0002-7779-5177"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lin Wang","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090519741","display_name":"Leihao Pei","orcid":null},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Leihao Pei","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102021236","display_name":"Jiang Yang","orcid":"https://orcid.org/0000-0002-6431-7483"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiang Yang","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054552826","display_name":"Enyun Yu","orcid":"https://orcid.org/0009-0009-0847-7464"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Enyun Yu","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100318755","display_name":"Xiaobo Li","orcid":"https://orcid.org/0000-0002-8074-0230"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaobo Li","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5012687475","display_name":"Binqiang Zhao","orcid":"https://orcid.org/0009-0003-3990-6694"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Binqiang Zhao","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5005499406"],"corresponding_institution_ids":["https://openalex.org/I45928872"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.10519608,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"349","last_page":"357"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9944000244140625,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.849333643913269},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6408509612083435},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.6060604453086853},{"id":"https://openalex.org/keywords/phrase","display_name":"Phrase","score":0.577465832233429},{"id":"https://openalex.org/keywords/visual-word","display_name":"Visual Word","score":0.5481052398681641},{"id":"https://openalex.org/keywords/explicit-semantic-analysis","display_name":"Explicit semantic analysis","score":0.5076674222946167},{"id":"https://openalex.org/keywords/image-retrieval","display_name":"Image retrieval","score":0.49624353647232056},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.48271510004997253},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.47949081659317017},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4509545862674713},{"id":"https://openalex.org/keywords/semantic-computing","display_name":"Semantic computing","score":0.37934133410453796},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3684607446193695},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.3170088231563568},{"id":"https://openalex.org/keywords/semantic-technology","display_name":"Semantic technology","score":0.10463675856590271},{"id":"https://openalex.org/keywords/semantic-web","display_name":"Semantic Web","score":0.08985534310340881}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.849333643913269},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6408509612083435},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.6060604453086853},{"id":"https://openalex.org/C2776224158","wikidata":"https://www.wikidata.org/wiki/Q187931","display_name":"Phrase","level":2,"score":0.577465832233429},{"id":"https://openalex.org/C189391414","wikidata":"https://www.wikidata.org/wiki/Q7936579","display_name":"Visual Word","level":4,"score":0.5481052398681641},{"id":"https://openalex.org/C173862523","wikidata":"https://www.wikidata.org/wiki/Q5421270","display_name":"Explicit semantic analysis","level":5,"score":0.5076674222946167},{"id":"https://openalex.org/C1667742","wikidata":"https://www.wikidata.org/wiki/Q10927554","display_name":"Image retrieval","level":3,"score":0.49624353647232056},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.48271510004997253},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.47949081659317017},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4509545862674713},{"id":"https://openalex.org/C511149849","wikidata":"https://www.wikidata.org/wiki/Q7449051","display_name":"Semantic computing","level":3,"score":0.37934133410453796},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3684607446193695},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.3170088231563568},{"id":"https://openalex.org/C6881194","wikidata":"https://www.wikidata.org/wiki/Q7449091","display_name":"Semantic technology","level":4,"score":0.10463675856590271},{"id":"https://openalex.org/C2129575","wikidata":"https://www.wikidata.org/wiki/Q54837","display_name":"Semantic Web","level":2,"score":0.08985534310340881},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3460426.3463610","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3460426.3463610","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2021 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.41999998688697815}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":19,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W2185175083","https://openalex.org/W2333091651","https://openalex.org/W2546696630","https://openalex.org/W2613718673","https://openalex.org/W2956018683","https://openalex.org/W2962964995","https://openalex.org/W2963040148","https://openalex.org/W2963403868","https://openalex.org/W2964120214","https://openalex.org/W2965848243","https://openalex.org/W2979304729","https://openalex.org/W2981448908","https://openalex.org/W2981586349","https://openalex.org/W2982078236","https://openalex.org/W2988823324","https://openalex.org/W2999905431","https://openalex.org/W3035552787","https://openalex.org/W6603860191"],"related_works":["https://openalex.org/W2548806402","https://openalex.org/W64345524","https://openalex.org/W1974970223","https://openalex.org/W2366482673","https://openalex.org/W2540793605","https://openalex.org/W2151414079","https://openalex.org/W2083396186","https://openalex.org/W2545852610","https://openalex.org/W2352508935","https://openalex.org/W1827099466"],"abstract_inverted_index":{"Effective":[0],"visual-semantic":[1],"representation":[2,17,133],"is":[3,47,143],"critical":[4],"to":[5,14,69,106,155,229],"the":[6,31,44,51,71,78,93,115,120,141,157,179,231,234],"image-text":[7,52,149],"matching":[8,53],"task.":[9,54],"Various":[10],"methods":[11],"are":[12,227],"proposed":[13,235],"develop":[15,124],"image":[16,37,117,190,196,210],"with":[18,65,145,189,198],"more":[19],"semantic":[20,45,67,81,90,111],"concepts":[21,91,112],"and":[22,38,85,113,151,162,171,192,215,223],"a":[23,60],"lot":[24],"of":[25,148,233],"progress":[26],"has":[27],"been":[28],"achieved.":[29],"However,":[30],"internal":[32],"hierarchical":[33,72],"structure":[34],"in":[35,50,186],"both":[36],"text,":[39],"which":[40],"could":[41],"effectively":[42],"enhance":[43],"representation,":[46],"rarely":[48],"explored":[49],"In":[55],"this":[56],"work,":[57],"we":[58,75,100,123],"propose":[59],"Hierarchical":[61],"Visual-Semantic":[62],"Network":[63],"(HVSN)":[64],"fine-grained":[66,158],"alignment":[68,153],"exploit":[70],"structure.":[73],"Specifically,":[74],"first":[76],"model":[77,142],"spatial":[79],"or":[80],"relationship":[82],"between":[83,109,137,160],"objects":[84],"aggregate":[86],"them":[87],"into":[88],"visual":[89,110,224],"by":[92,134,183,212,218],"Local":[94],"Relational":[95],"Attention":[96],"(LRA)":[97],"module.":[98],"Then":[99],"employ":[101],"Gated":[102],"Recurrent":[103],"Unit":[104],"(GRU)":[105],"learn":[107],"relationships":[108,136],"generate":[114,131],"global":[116],"representation.":[118],"For":[119],"text":[121,132,187,199,216],"part,":[122],"phrase":[125,152],"features":[126],"from":[127],"related":[128],"words,":[129],"then":[130],"learning":[135],"these":[138],"phrases.":[139],"Besides,":[140],"trained":[144],"joint":[146],"optimization":[147],"retrieval":[150,188,197,211,217],"task":[154],"capture":[156],"interplay":[159],"vision":[161],"language.":[163],"Our":[164],"approach":[165,177],"achieves":[166],"state-of-the-art":[167,181],"performance":[168],"on":[169,202],"Flickr30K":[170],"MS-COCO":[172],"datasets.":[173],"On":[174,205],"Flickr30K,":[175],"our":[176,207],"outperforms":[178],"current":[180],"method":[182],"3.9%":[184],"relatively":[185,194,214],"query":[191,200],"1.3%":[193],"for":[195],"(based":[201],"[email":[203],"protected]).":[204],"MS-COCO,":[206],"HVSN":[208],"improves":[209],"2.3%":[213],"1.2%":[219],"relatively.":[220],"Both":[221],"quantitative":[222],"ablation":[225],"studies":[226],"provided":[228],"verify":[230],"effectiveness":[232],"modules.":[236]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
