{"id":"https://openalex.org/W7127418927","doi":"https://doi.org/10.1109/ism66958.2025.00052","title":"Extending Visual Dialog Beyond English: An Analysis of Monolingual and Multilingual Models","display_name":"Extending Visual Dialog Beyond English: An Analysis of Monolingual and Multilingual Models","publication_year":2025,"publication_date":"2025-12-08","ids":{"openalex":"https://openalex.org/W7127418927","doi":"https://doi.org/10.1109/ism66958.2025.00052"},"language":null,"primary_location":{"id":"doi:10.1109/ism66958.2025.00052","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ism66958.2025.00052","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Symposium on Multimedia (ISM)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5002039299","display_name":"Milena M. Ad\u00e3o","orcid":null},"institutions":[{"id":"https://openalex.org/I170935008","display_name":"Pontif\u00edcia Universidade Cat\u00f3lica de Minas Gerais","ror":"https://ror.org/03j1rr444","country_code":"BR","type":"education","lineage":["https://openalex.org/I170935008"]}],"countries":["BR"],"is_corresponding":false,"raw_author_name":"Milena Menezes Ad\u00e3o","raw_affiliation_strings":["Pontif&#x00ED;cia Universidade Cat&#x00F3;lica de Minas Gerais (PUC Minas),IMScience Lab,Belo Horizonte,Brazil"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Pontif&#x00ED;cia Universidade Cat&#x00F3;lica de Minas Gerais (PUC Minas),IMScience Lab,Belo Horizonte,Brazil","institution_ids":["https://openalex.org/I170935008"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123565304","display_name":"Silvio Jamil F. Guimar\u00e3es","orcid":null},"institutions":[{"id":"https://openalex.org/I170935008","display_name":"Pontif\u00edcia Universidade Cat\u00f3lica de Minas Gerais","ror":"https://ror.org/03j1rr444","country_code":"BR","type":"education","lineage":["https://openalex.org/I170935008"]}],"countries":["BR"],"is_corresponding":false,"raw_author_name":"Silvio Jamil F. Guimar\u00e3es","raw_affiliation_strings":["Pontif&#x00ED;cia Universidade Cat&#x00F3;lica de Minas Gerais (PUC Minas),IMScience Lab,Belo Horizonte,Brazil"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Pontif&#x00ED;cia Universidade Cat&#x00F3;lica de Minas Gerais (PUC Minas),IMScience Lab,Belo Horizonte,Brazil","institution_ids":["https://openalex.org/I170935008"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5124966344","display_name":"Zenilton K. G. Patroc\u00ednio","orcid":null},"institutions":[{"id":"https://openalex.org/I170935008","display_name":"Pontif\u00edcia Universidade Cat\u00f3lica de Minas Gerais","ror":"https://ror.org/03j1rr444","country_code":"BR","type":"education","lineage":["https://openalex.org/I170935008"]}],"countries":["BR"],"is_corresponding":false,"raw_author_name":"Zenilton K. G. Patroc\u00ednio","raw_affiliation_strings":["Pontif&#x00ED;cia Universidade Cat&#x00F3;lica de Minas Gerais (PUC Minas),IMScience Lab,Belo Horizonte,Brazil"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Pontif&#x00ED;cia Universidade Cat&#x00F3;lica de Minas Gerais (PUC Minas),IMScience Lab,Belo Horizonte,Brazil","institution_ids":["https://openalex.org/I170935008"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.63736437,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"226","last_page":"233"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9879000186920166,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9879000186920166,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11148","display_name":"Language, Metaphor, and Cognition","score":0.0020000000949949026,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T13310","display_name":"Subtitles and Audiovisual Media","score":0.0006000000284984708,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/interpretability","display_name":"Interpretability","score":0.66839998960495},{"id":"https://openalex.org/keywords/dialog-box","display_name":"Dialog box","score":0.6524999737739563},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5407000184059143},{"id":"https://openalex.org/keywords/portuguese","display_name":"Portuguese","score":0.38350000977516174},{"id":"https://openalex.org/keywords/brazilian-portuguese","display_name":"Brazilian Portuguese","score":0.38179999589920044},{"id":"https://openalex.org/keywords/divergence","display_name":"Divergence (linguistics)","score":0.3702000081539154},{"id":"https://openalex.org/keywords/lexicon","display_name":"Lexicon","score":0.3587000072002411},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.3319000005722046}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7235000133514404},{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.66839998960495},{"id":"https://openalex.org/C173853756","wikidata":"https://www.wikidata.org/wiki/Q86915","display_name":"Dialog box","level":2,"score":0.6524999737739563},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.6403999924659729},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5561000108718872},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5407000184059143},{"id":"https://openalex.org/C35219183","wikidata":"https://www.wikidata.org/wiki/Q5146","display_name":"Portuguese","level":2,"score":0.38350000977516174},{"id":"https://openalex.org/C2778880076","wikidata":"https://www.wikidata.org/wiki/Q750553","display_name":"Brazilian Portuguese","level":3,"score":0.38179999589920044},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.3779999911785126},{"id":"https://openalex.org/C207390915","wikidata":"https://www.wikidata.org/wiki/Q1230525","display_name":"Divergence (linguistics)","level":2,"score":0.3702000081539154},{"id":"https://openalex.org/C2778121359","wikidata":"https://www.wikidata.org/wiki/Q8096","display_name":"Lexicon","level":2,"score":0.3587000072002411},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3319000005722046},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.32760000228881836},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.3224000036716461},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.3131999969482422},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.3005000054836273},{"id":"https://openalex.org/C135641252","wikidata":"https://www.wikidata.org/wiki/Q738567","display_name":"Multimodal interaction","level":2,"score":0.28220000863075256},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2815999984741211},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.27570000290870667},{"id":"https://openalex.org/C527412718","wikidata":"https://www.wikidata.org/wiki/Q855395","display_name":"Interpretation (philosophy)","level":2,"score":0.2700999975204468},{"id":"https://openalex.org/C2780660688","wikidata":"https://www.wikidata.org/wiki/Q25052564","display_name":"Multimodal learning","level":2,"score":0.26930001378059387},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.2547000050544739}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/ism66958.2025.00052","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ism66958.2025.00052","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Symposium on Multimedia (ISM)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.5688747763633728}],"awards":[{"id":"https://openalex.org/G1586990263","display_name":null,"funder_award_id":"APQ-01079-23,APQ-05058-23","funder_id":"https://openalex.org/F4320322980","funder_display_name":"Funda\u00e7\u00e3o de Amparo \u00e0 Pesquisa do Estado de Minas Gerais"},{"id":"https://openalex.org/G4608061574","display_name":null,"funder_award_id":"PROAP 88887.842889/2023-00 - PUC/MG,STIC-AMSUD 88887.878869/2023-00,23-STIC-10,001","funder_id":"https://openalex.org/F4320321091","funder_display_name":"Coordena\u00e7\u00e3o de Aperfei\u00e7oamento de Pessoal de N\u00edvel Superior"},{"id":"https://openalex.org/G5772736176","display_name":null,"funder_award_id":"407242/2021-0,306573/2022-9,442950/2023-3","funder_id":"https://openalex.org/F4320322025","funder_display_name":"Conselho Nacional de Desenvolvimento Cient\u00edfico e Tecnol\u00f3gico"}],"funders":[{"id":"https://openalex.org/F4320316054","display_name":"Pontif\u00edcia Universidade Cat\u00f3lica de Minas Gerais","ror":"https://ror.org/03j1rr444"},{"id":"https://openalex.org/F4320321091","display_name":"Coordena\u00e7\u00e3o de Aperfei\u00e7oamento de Pessoal de N\u00edvel Superior","ror":"https://ror.org/00x0ma614"},{"id":"https://openalex.org/F4320322025","display_name":"Conselho Nacional de Desenvolvimento Cient\u00edfico e Tecnol\u00f3gico","ror":"https://ror.org/03swz6y49"},{"id":"https://openalex.org/F4320322980","display_name":"Funda\u00e7\u00e3o de Amparo \u00e0 Pesquisa do Estado de Minas Gerais","ror":"https://ror.org/00nc55f03"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":28,"referenced_works":["https://openalex.org/W1933349210","https://openalex.org/W2108862644","https://openalex.org/W2892245540","https://openalex.org/W2917061951","https://openalex.org/W2963287297","https://openalex.org/W2963341956","https://openalex.org/W2963623904","https://openalex.org/W2964204621","https://openalex.org/W2964218959","https://openalex.org/W2970231061","https://openalex.org/W2970355596","https://openalex.org/W2972324944","https://openalex.org/W2981902456","https://openalex.org/W2996781902","https://openalex.org/W3010694149","https://openalex.org/W3035398197","https://openalex.org/W3095309002","https://openalex.org/W3099143471","https://openalex.org/W3107092117","https://openalex.org/W3116651605","https://openalex.org/W3174010726","https://openalex.org/W3175076935","https://openalex.org/W3177654849","https://openalex.org/W3200050180","https://openalex.org/W4229019825","https://openalex.org/W4249013746","https://openalex.org/W4385245566","https://openalex.org/W4386066344"],"related_works":[],"abstract_inverted_index":{"Visual":[0,48],"Dialog":[1,49],"is":[2],"a":[3],"challenging":[4],"multimodal":[5,163],"task":[6],"requiring":[7],"models":[8,50,89,134,141],"to":[9,28,112,126],"answer":[10],"questions":[11],"about":[12],"images":[13],"through":[14],"multi-turn":[15],"conversations.":[16],"Despite":[17],"significant":[18],"progress,":[19],"research":[20],"has":[21],"predominantly":[22],"focused":[23],"on":[24,66],"English,":[25],"limiting":[26],"applicability":[27],"the":[29,40],"850+":[30],"million":[31],"speakers":[32],"of":[33,44],"Portuguese":[34,52],"and":[35,46,53,106,158],"Spanish":[36],"worldwide.":[37],"We":[38,130],"present":[39],"first":[41],"comprehensive":[42],"study":[43],"monolingual":[45,88,133],"multilingual":[47,82,140],"for":[51,76,79,153],"Spanish,":[54],"introducing":[55],"novel":[56],"insights":[57],"into":[58],"cross-lingual":[59,160],"visual":[60,146],"grounding":[61],"mechanisms.":[62],"Through":[63],"extensive":[64],"experiments":[65],"newly":[67],"translated":[68],"VisDial":[69],"datasets,":[70],"we":[71],"compare":[72],"language-specific":[73],"encoders":[74],"(BERTimbau":[75],"Portuguese,":[77],"BETO":[78],"Spanish)":[80],"against":[81],"BERT,":[83],"achieving":[84],"competitive":[85],"performance":[86],"with":[87,119],"while":[90,139],"revealing":[91],"distinct":[92],"cross-modal":[93],"attention":[94,114,137],"patterns.":[95],"Our":[96],"mechanistic":[97],"interpretability":[98],"analysis":[99],"demonstrates":[100],"that":[101,132],"despite":[102],"different":[103],"tokenization":[104],"strategies":[105,138],"pretraining":[107],"objectives,":[108],"both":[109],"approaches":[110],"converge":[111],"similar":[113],"distributions":[115],"in":[116,162],"deeper":[117],"layers,":[118],"divergence":[120],"decreasing":[121],"from":[122],"0.000832":[123],"(Layer":[124,128],"0)":[125],"0.000490":[127],"11).":[129],"find":[131],"exhibit":[135],"holistic":[136],"show":[142],"more":[143],"selective,":[144],"fine-grained":[145],"grounding.":[147],"These":[148],"findings":[149],"have":[150],"important":[151],"implications":[152],"developing":[154],"inclusive":[155],"vision-language":[156],"technologies":[157],"understanding":[159],"transfer":[161],"contexts.":[164]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-02-04T00:00:00"}
