{"id":"https://openalex.org/W4304080179","doi":"https://doi.org/10.1145/3503161.3547776","title":"Unsupervised and Pseudo-Supervised Vision-Language Alignment in Visual Dialog","display_name":"Unsupervised and Pseudo-Supervised Vision-Language Alignment in Visual Dialog","publication_year":2022,"publication_date":"2022-10-10","ids":{"openalex":"https://openalex.org/W4304080179","doi":"https://doi.org/10.1145/3503161.3547776"},"language":"en","primary_location":{"id":"doi:10.1145/3503161.3547776","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3503161.3547776","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3503161.3547776","source":{"id":"https://openalex.org/S4363608757","display_name":"Proceedings of the 30th ACM International Conference on Multimedia","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3503161.3547776","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101667541","display_name":"Feilong Chen","orcid":"https://orcid.org/0000-0002-4860-8483"},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Feilong Chen","raw_affiliation_strings":["Institute of Automation, Chinese Academy of Sciences &amp; University of CAS, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Automation, Chinese Academy of Sciences &amp; University of CAS, Beijing, China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077267369","display_name":"Duzhen Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210094879","display_name":"Shandong Institute of Automation","ror":"https://ror.org/00qdtba35","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210094879","https://openalex.org/I4210142748"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Duzhen Zhang","raw_affiliation_strings":["Institute of Automation, Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Automation, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210094879","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009070585","display_name":"Xiuyi Chen","orcid":"https://orcid.org/0000-0002-9351-4160"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210094879","display_name":"Shandong Institute of Automation","ror":"https://ror.org/00qdtba35","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210094879","https://openalex.org/I4210142748"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiuyi Chen","raw_affiliation_strings":["Institute of Automation, Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Automation, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210094879","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037866934","display_name":"Jing Shi","orcid":"https://orcid.org/0000-0001-5624-9698"},"institutions":[{"id":"https://openalex.org/I4210094879","display_name":"Shandong Institute of Automation","ror":"https://ror.org/00qdtba35","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210094879","https://openalex.org/I4210142748"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jing Shi","raw_affiliation_strings":["Institute of Automation, Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Automation, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210094879","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004363895","display_name":"Shuang Xu","orcid":"https://orcid.org/0000-0003-3576-6914"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210094879","display_name":"Shandong Institute of Automation","ror":"https://ror.org/00qdtba35","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210094879","https://openalex.org/I4210142748"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shuang Xu","raw_affiliation_strings":["Institute of Automation, Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Automation, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210094879","https://openalex.org/I19820366"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5108642431","display_name":"Bo Xu","orcid":"https://orcid.org/0000-0002-1111-1529"},"institutions":[{"id":"https://openalex.org/I4210094879","display_name":"Shandong Institute of Automation","ror":"https://ror.org/00qdtba35","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210094879","https://openalex.org/I4210142748"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bo XU","raw_affiliation_strings":["Institute of Automation, Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Automation, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210094879","https://openalex.org/I19820366"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5101667541"],"corresponding_institution_ids":["https://openalex.org/I19820366","https://openalex.org/I4210112150"],"apc_list":null,"apc_paid":null,"fwci":0.4797,"has_fulltext":true,"cited_by_count":8,"citation_normalized_percentile":{"value":0.73171253,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"4142","last_page":"4153"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9886999726295471,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9850999712944031,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/dialog-box","display_name":"Dialog box","score":0.8932516574859619},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8370623588562012},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.7112776041030884},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5610872507095337},{"id":"https://openalex.org/keywords/autoencoder","display_name":"Autoencoder","score":0.4815266728401184},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.44371482729911804},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.44204580783843994},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.4418708086013794},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.32112666964530945},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.21957162022590637}],"concepts":[{"id":"https://openalex.org/C173853756","wikidata":"https://www.wikidata.org/wiki/Q86915","display_name":"Dialog box","level":2,"score":0.8932516574859619},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8370623588562012},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.7112776041030884},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5610872507095337},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.4815266728401184},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.44371482729911804},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.44204580783843994},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.4418708086013794},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.32112666964530945},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.21957162022590637},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3503161.3547776","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3503161.3547776","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3503161.3547776","source":{"id":"https://openalex.org/S4363608757","display_name":"Proceedings of the 30th ACM International Conference on Multimedia","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3503161.3547776","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3503161.3547776","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3503161.3547776","source":{"id":"https://openalex.org/S4363608757","display_name":"Proceedings of the 30th ACM International Conference on Multimedia","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Multimedia","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G3080151772","display_name":null,"funder_award_id":"No.ZDBS-SSW-JSC006","funder_id":"https://openalex.org/F4320321133","funder_display_name":"Chinese Academy of Sciences"}],"funders":[{"id":"https://openalex.org/F4320321133","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4304080179.pdf","grobid_xml":"https://content.openalex.org/works/W4304080179.grobid-xml"},"referenced_works_count":47,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W2091158010","https://openalex.org/W2108862644","https://openalex.org/W2277195237","https://openalex.org/W2619383789","https://openalex.org/W2745461083","https://openalex.org/W2886641317","https://openalex.org/W2962766617","https://openalex.org/W2962858109","https://openalex.org/W2963037989","https://openalex.org/W2963087868","https://openalex.org/W2963124051","https://openalex.org/W2963287297","https://openalex.org/W2964218959","https://openalex.org/W2966158321","https://openalex.org/W2967045987","https://openalex.org/W2970355596","https://openalex.org/W2981902456","https://openalex.org/W2982952731","https://openalex.org/W2986823309","https://openalex.org/W2987734933","https://openalex.org/W2990138404","https://openalex.org/W2995156524","https://openalex.org/W2997547717","https://openalex.org/W3000723049","https://openalex.org/W3004349648","https://openalex.org/W3034655362","https://openalex.org/W3034787499","https://openalex.org/W3035103424","https://openalex.org/W3035260401","https://openalex.org/W3090449556","https://openalex.org/W3091588028","https://openalex.org/W3095309002","https://openalex.org/W3107092117","https://openalex.org/W3116651605","https://openalex.org/W3138516171","https://openalex.org/W3173220247","https://openalex.org/W3173859428","https://openalex.org/W3175076935","https://openalex.org/W3176584016","https://openalex.org/W3213646008","https://openalex.org/W4207035468","https://openalex.org/W4224920405","https://openalex.org/W4236965008","https://openalex.org/W4249013746","https://openalex.org/W4304088351","https://openalex.org/W6601883942"],"related_works":["https://openalex.org/W2098987383","https://openalex.org/W3013693939","https://openalex.org/W2417260800","https://openalex.org/W2566616303","https://openalex.org/W1596203174","https://openalex.org/W2159052453","https://openalex.org/W2117933979","https://openalex.org/W3131327266","https://openalex.org/W3023366413","https://openalex.org/W3034808773"],"abstract_inverted_index":{"Visual":[0],"dialog":[1,63,71,138,167],"requires":[2],"models":[3],"to":[4,9,73,108,125],"give":[5],"reasonable":[6,123],"answers":[7],"according":[8],"a":[10,53,99,122,171],"series":[11],"of":[12,40,144,174],"coherent":[13],"questions":[14],"and":[15,56,70,76,89,94,117,147],"related":[16],"visual":[17,45,62,69,82,106,116,137,166],"concepts":[18,83],"in":[19,44],"images.":[20],"However,":[21],"most":[22],"current":[23],"work":[24],"either":[25],"focuses":[26],"on":[27,32,113,134,164],"attention-based":[28],"fusion":[29],"or":[30],"pre-training":[31],"large-scale":[33,136],"image-text":[34],"pairs,":[35],"ignoring":[36],"the":[37,68,114,126,129,142,165,177],"critical":[38],"role":[39],"explicit":[41],"vision-language":[42,58,91,145],"alignment":[43,59,92],"dialog.":[46],"To":[47],"remedy":[48],"this":[49],"defect,":[50],"we":[51],"propose":[52],"novel":[54],"unsupervised":[55,88],"pseudo-supervised":[57,90],"approach":[60],"for":[61],"(AlignVD).":[64],"Firstly,":[65],"AlginVD":[66],"utilizes":[67,98],"encoder":[72],"represent":[74],"images":[75],"dialogs.":[77],"Then,":[78],"it":[79],"explicitly":[80],"aligns":[81],"with":[84,170],"textual":[85,118],"semantics":[86],"via":[87,128],"(UVLA":[93],"PVLA).":[95],"Specifically,":[96],"UVLA":[97],"graph":[100],"autoencoder,":[101],"while":[102],"PVLA":[103],"uses":[104],"dialog-guided":[105],"grounding":[107],"conduct":[109],"alignment.":[110],"Finally,":[111],"based":[112],"aligned":[115],"representations,":[119],"AlignVD":[120,150],"gives":[121],"answer":[124],"question":[127],"cross-modal":[130],"decoder.":[131],"Extensive":[132],"experiments":[133],"two":[135],"datasets":[139],"have":[140],"demonstrated":[141],"effectiveness":[143],"alignment,":[146],"our":[148,157],"proposed":[149],"achieves":[151],"new":[152],"state-of-the-art":[153],"results.":[154],"In":[155],"addition,":[156],"single":[158],"model":[159,181],"has":[160],"won":[161],"first":[162],"place":[163],"challenge":[168],"leaderboard":[169],"NDCG":[172],"metric":[173],"78.70,":[175],"surpassing":[176],"previous":[178],"best":[179],"ensemble":[180],"by":[182],"about":[183],"1":[184],"point.":[185]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":3},{"year":2023,"cited_by_count":4}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
