{"id":"https://openalex.org/W4415540543","doi":"https://doi.org/10.1145/3746027.3755633","title":"Seeing Through Ambiguity: Effective Video-guided Machine Translation via Chaotic Fusion and Causally Aligned Spatio-temporal Attention","display_name":"Seeing Through Ambiguity: Effective Video-guided Machine Translation via Chaotic Fusion and Causally Aligned Spatio-temporal Attention","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415540543","doi":"https://doi.org/10.1145/3746027.3755633"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3755633","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755633","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102495507","display_name":"Jiawei Zheng","orcid":"https://orcid.org/0009-0003-9643-3643"},"institutions":[{"id":"https://openalex.org/I191208505","display_name":"Xiamen University","ror":"https://ror.org/00mcjh785","country_code":"CN","type":"education","lineage":["https://openalex.org/I191208505"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jiawei Zheng","raw_affiliation_strings":["School of Informatics, Xiamen University, Xiamen, China"],"affiliations":[{"raw_affiliation_string":"School of Informatics, Xiamen University, Xiamen, China","institution_ids":["https://openalex.org/I191208505"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103326399","display_name":"Feiyan Liu","orcid":"https://orcid.org/0009-0006-8277-6380"},"institutions":[{"id":"https://openalex.org/I191208505","display_name":"Xiamen University","ror":"https://ror.org/00mcjh785","country_code":"CN","type":"education","lineage":["https://openalex.org/I191208505"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Feiyan Liu","raw_affiliation_strings":["School of Informatics, Xiamen University, Xiamen, China"],"affiliations":[{"raw_affiliation_string":"School of Informatics, Xiamen University, Xiamen, China","institution_ids":["https://openalex.org/I191208505"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100456913","display_name":"Xiaoli Wang","orcid":"https://orcid.org/0000-0002-8677-9080"},"institutions":[{"id":"https://openalex.org/I191208505","display_name":"Xiamen University","ror":"https://ror.org/00mcjh785","country_code":"CN","type":"education","lineage":["https://openalex.org/I191208505"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaoli Wang","raw_affiliation_strings":["Super Intelligent Medical Innovation Research Center, School of Informatics, Xiamen University, Xiamen, China"],"affiliations":[{"raw_affiliation_string":"Super Intelligent Medical Innovation Research Center, School of Informatics, Xiamen University, Xiamen, China","institution_ids":["https://openalex.org/I191208505"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5102495507"],"corresponding_institution_ids":["https://openalex.org/I191208505"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.32642964,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"4837","last_page":"4845"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9927999973297119,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.6347000002861023},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.6136999726295471},{"id":"https://openalex.org/keywords/translation","display_name":"Translation (biology)","score":0.5145999789237976},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4945000112056732},{"id":"https://openalex.org/keywords/mechanism","display_name":"Mechanism (biology)","score":0.43689998984336853},{"id":"https://openalex.org/keywords/machine-translation","display_name":"Machine translation","score":0.42640000581741333},{"id":"https://openalex.org/keywords/fusion-mechanism","display_name":"Fusion mechanism","score":0.39309999346733093},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.3562000095844269}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8525999784469604},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6468999981880188},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.6347000002861023},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.6136999726295471},{"id":"https://openalex.org/C149364088","wikidata":"https://www.wikidata.org/wiki/Q185917","display_name":"Translation (biology)","level":4,"score":0.5145999789237976},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4945000112056732},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.43689998984336853},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.42640000581741333},{"id":"https://openalex.org/C173414695","wikidata":"https://www.wikidata.org/wiki/Q5510276","display_name":"Fusion mechanism","level":4,"score":0.39309999346733093},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.38100001215934753},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.37059998512268066},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3562000095844269},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3472000062465668},{"id":"https://openalex.org/C130318100","wikidata":"https://www.wikidata.org/wiki/Q2268914","display_name":"Semantic similarity","level":2,"score":0.3050000071525574},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.29829999804496765},{"id":"https://openalex.org/C2777052490","wikidata":"https://www.wikidata.org/wiki/Q5072826","display_name":"Chaotic","level":2,"score":0.29319998621940613},{"id":"https://openalex.org/C64754055","wikidata":"https://www.wikidata.org/wiki/Q7574053","display_name":"Spatial contextual awareness","level":2,"score":0.27250000834465027},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2685999870300293},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.2619999945163727},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.2572999894618988},{"id":"https://openalex.org/C158525013","wikidata":"https://www.wikidata.org/wiki/Q2593739","display_name":"Fusion","level":2,"score":0.25270000100135803},{"id":"https://openalex.org/C189950617","wikidata":"https://www.wikidata.org/wiki/Q937228","display_name":"Property (philosophy)","level":2,"score":0.25060001015663147}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3755633","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755633","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":8,"referenced_works":["https://openalex.org/W1597944220","https://openalex.org/W2006642504","https://openalex.org/W2133459682","https://openalex.org/W2513263213","https://openalex.org/W2963524571","https://openalex.org/W2989322838","https://openalex.org/W4214911019","https://openalex.org/W4221155857"],"related_works":[],"abstract_inverted_index":{"Video-guided":[0],"machine":[1],"translation":[2,23],"(VMT)":[3],"involves":[4],"taking":[5],"text":[6],"and":[7,37,54,83,110,129,146],"video":[8,53,109,147],"modalities":[9,75,148],"as":[10],"inputs,":[11],"leveraging":[12],"visual":[13,38,61,82,88],"context":[14],"to":[15,30,69,99],"resolve":[16],"the":[17,22,31,42,108],"semantic":[18,79,101],"ambiguities":[19,123],"for":[20],"improving":[21],"quality.":[24],"This":[25],"task":[26],"remains":[27],"challenging":[28],"due":[29],"difficulty":[32],"of":[33],"effective":[34],"cross-modal":[35],"integration":[36],"grounding.":[39],"To":[40,86],"address":[41],"issues,":[43],"we":[44],"propose":[45,116],"a":[46,65,90],"novel":[47],"VMT":[48],"model":[49,135],"that":[50,76,133,143],"combines":[51],"temporal":[52],"spatial":[55],"keyframe":[56,111,145],"streams":[57],"by":[58,103],"providing":[59],"complementary":[60],"cues.":[62,85],"We":[63,114],"develop":[64],"chaotic":[66],"fusion":[67],"mechanism":[68,95],"integrate":[70],"modality-specific":[71],"representations":[72],"from":[73],"various":[74],"help":[77],"capture":[78],"interactions":[80],"between":[81],"textual":[84],"improve":[87],"grounding,":[89],"causally":[91],"aligned":[92],"spatio-temporal":[93],"attention":[94,106],"is":[96,156],"also":[97,141],"designed":[98],"enhance":[100],"alignment":[102],"refining":[104],"decoder-side":[105],"over":[107],"streams,":[112],"respectively.":[113],"further":[115],"PolyVTE,":[117],"an":[118],"evaluation":[119],"dataset":[120,155],"targeting":[121],"polysemous":[122],"in":[124],"VMT.":[125],"Results":[126],"on":[127],"VATEX":[128],"PolyVTE":[130,154],"datasets":[131],"show":[132],"our":[134],"outperforms":[136],"state-of-the-art":[137],"models.":[138],"The":[139,153],"results":[140],"prove":[142],"using":[144],"significantly":[149],"improves":[150],"disambiguation":[151],"capabilities.":[152],"available":[157],"at":[158],"https://github.com/zheng5d/PolyVTE.":[159]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-25T00:00:00"}
