{"id":"https://openalex.org/W4415537437","doi":"https://doi.org/10.1145/3746027.3754833","title":"BiMa: Towards Biases Mitigation for Text-Video Retrieval via Scene Element Guidance","display_name":"BiMa: Towards Biases Mitigation for Text-Video Retrieval via Scene Element Guidance","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415537437","doi":"https://doi.org/10.1145/3746027.3754833"},"language":"en","primary_location":{"id":"doi:10.1145/3746027.3754833","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3754833","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Huy Le","orcid":"https://orcid.org/0000-0003-2021-9678"},"institutions":[{"id":"https://openalex.org/I109689652","display_name":"FPT University","ror":"https://ror.org/03esj4g97","country_code":"VN","type":"education","lineage":["https://openalex.org/I109689652"]}],"countries":["VN"],"is_corresponding":true,"raw_author_name":"Huy Le","raw_affiliation_strings":["FPT Software AI Center, Hanoi, Vietnam"],"raw_orcid":"https://orcid.org/0000-0003-2021-9678","affiliations":[{"raw_affiliation_string":"FPT Software AI Center, Hanoi, Vietnam","institution_ids":["https://openalex.org/I109689652"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074271464","display_name":"Nhat Minh Chung","orcid":"https://orcid.org/0000-0003-2966-3492"},"institutions":[{"id":"https://openalex.org/I109689652","display_name":"FPT University","ror":"https://ror.org/03esj4g97","country_code":"VN","type":"education","lineage":["https://openalex.org/I109689652"]}],"countries":["VN"],"is_corresponding":false,"raw_author_name":"Nhat Chung","raw_affiliation_strings":["FPT Software AI Center, Hanoi, Vietnam"],"raw_orcid":"https://orcid.org/0000-0003-2966-3492","affiliations":[{"raw_affiliation_string":"FPT Software AI Center, Hanoi, Vietnam","institution_ids":["https://openalex.org/I109689652"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016799625","display_name":"Tung Kieu","orcid":"https://orcid.org/0000-0002-7696-1444"},"institutions":[{"id":"https://openalex.org/I891191580","display_name":"Aalborg University","ror":"https://ror.org/04m5j1k67","country_code":"DK","type":"education","lineage":["https://openalex.org/I891191580"]}],"countries":["DK"],"is_corresponding":false,"raw_author_name":"Tung Kieu","raw_affiliation_strings":["Aalborg University, Aalborg, Denmark and Pioneer Centre for AI, Copenhagen, Denmark"],"raw_orcid":"https://orcid.org/0000-0002-7696-1444","affiliations":[{"raw_affiliation_string":"Aalborg University, Aalborg, Denmark and Pioneer Centre for AI, Copenhagen, Denmark","institution_ids":["https://openalex.org/I891191580"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089006585","display_name":"Anh Nguyen","orcid":"https://orcid.org/0000-0002-1449-211X"},"institutions":[{"id":"https://openalex.org/I146655781","display_name":"University of Liverpool","ror":"https://ror.org/04xs57h96","country_code":"GB","type":"education","lineage":["https://openalex.org/I146655781"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Anh Nguyen","raw_affiliation_strings":["University of Liverpool, Liverpool, United Kingdom"],"raw_orcid":"https://orcid.org/0000-0002-1449-211X","affiliations":[{"raw_affiliation_string":"University of Liverpool, Liverpool, United Kingdom","institution_ids":["https://openalex.org/I146655781"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5023725893","display_name":"Ngan Le","orcid":"https://orcid.org/0000-0003-2571-0511"},"institutions":[{"id":"https://openalex.org/I78715868","display_name":"University of Arkansas at Fayetteville","ror":"https://ror.org/05jbt9m15","country_code":"US","type":"education","lineage":["https://openalex.org/I78715868"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ngan Le","raw_affiliation_strings":["University of Arkansas, Fayetteville, USA"],"raw_orcid":"https://orcid.org/0000-0003-2571-0511","affiliations":[{"raw_affiliation_string":"University of Arkansas, Fayetteville, USA","institution_ids":["https://openalex.org/I78715868"]}]}],"institutions":[],"countries_distinct_count":4,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I109689652"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.28658407,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"2831","last_page":"2840"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9973999857902527,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.996999979019165,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/salient","display_name":"Salient","score":0.7208999991416931},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.66839998960495},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.6502000093460083},{"id":"https://openalex.org/keywords/element","display_name":"Element (criminal law)","score":0.4593000113964081},{"id":"https://openalex.org/keywords/perspective","display_name":"Perspective (graphical)","score":0.42969998717308044},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.38339999318122864},{"id":"https://openalex.org/keywords/mechanism","display_name":"Mechanism (biology)","score":0.3614000082015991}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7843999862670898},{"id":"https://openalex.org/C2780719617","wikidata":"https://www.wikidata.org/wiki/Q1030752","display_name":"Salient","level":2,"score":0.7208999991416931},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.66839998960495},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.6502000093460083},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5232999920845032},{"id":"https://openalex.org/C200288055","wikidata":"https://www.wikidata.org/wiki/Q2621792","display_name":"Element (criminal law)","level":2,"score":0.4593000113964081},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.42969998717308044},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.41830000281333923},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.38339999318122864},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.3614000082015991},{"id":"https://openalex.org/C146044194","wikidata":"https://www.wikidata.org/wiki/Q5157334","display_name":"Computational photography","level":4,"score":0.3321000039577484},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.33009999990463257},{"id":"https://openalex.org/C2983174267","wikidata":"https://www.wikidata.org/wiki/Q3775098","display_name":"Video retrieval","level":2,"score":0.3109000027179718},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.29809999465942383},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.29589998722076416},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.295199990272522},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.28060001134872437},{"id":"https://openalex.org/C2778152352","wikidata":"https://www.wikidata.org/wiki/Q5165061","display_name":"Content (measure theory)","level":2,"score":0.26100000739097595},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.25609999895095825},{"id":"https://openalex.org/C148417208","wikidata":"https://www.wikidata.org/wiki/Q4825882","display_name":"Authentication (law)","level":2,"score":0.2538999915122986},{"id":"https://openalex.org/C1667742","wikidata":"https://www.wikidata.org/wiki/Q10927554","display_name":"Image retrieval","level":3,"score":0.25369998812675476}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3746027.3754833","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3754833","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},{"id":"pmh:oai:pure.atira.dk:publications/404b8417-219d-4046-86e5-94de09727f5d","is_oa":false,"landing_page_url":"https://vbn.aau.dk/da/publications/404b8417-219d-4046-86e5-94de09727f5d","pdf_url":null,"source":{"id":"https://openalex.org/S4306401731","display_name":"VBN Forskningsportal (Aalborg Universitet)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I891191580","host_organization_name":"Aalborg University","host_organization_lineage":["https://openalex.org/I891191580"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Le, H, Chung, N, Kieu, T, Nguyen, A & Le, N 2025, BiMa : Towards Biases Mitigation for Text-Video Retrieval via Scene Element Guidance. in MM 2025 : Proceedings of the 33rd ACM International Conference on Multimedia, Co-Located with MM 2025. Association for Computing Machinery (ACM), pp. 2831-2840, 33rd ACM International Conference on Multimedia, MM 2025, Dublin, Ireland, 27/10/2025. https://doi.org/10.1145/3746027.3754833","raw_type":"info:eu-repo/semantics/publishedVersion"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":6,"referenced_works":["https://openalex.org/W2914699769","https://openalex.org/W4285606530","https://openalex.org/W4307730507","https://openalex.org/W4312583258","https://openalex.org/W4393147969","https://openalex.org/W4413146810"],"related_works":[],"abstract_inverted_index":{"Text-video":[0],"retrieval":[1,144],"(TVR)":[2],"systems":[3],"often":[4],"suffer":[5],"from":[6],"visual-linguistic":[7],"biases":[8,33],"present":[9],"in":[10,34],"datasets,":[11],"which":[12],"cause":[13],"pre-trained":[14],"vision-language":[15],"models":[16],"to":[17,31,71,84,96],"overlook":[18],"key":[19],"details.":[20,76],"To":[21],"address":[22],"this,":[23],"we":[24,60,80],"propose":[25],"BiMa,":[26],"a":[27,82],"novel":[28],"framework":[29],"designed":[30],"mitigate":[32],"both":[35],"visual":[36,58],"and":[37,55,74,90,108,121],"textual":[38,78],"representations.":[39],"Our":[40],"approach":[41],"begins":[42],"by":[43,51,138],"generating":[44],"scene":[45,63],"elements":[46,64],"that":[47],"characterize":[48],"each":[49],"video":[50,67],"identifying":[52],"relevant":[53],"entities/objects":[54],"activities.":[56],"For":[57,77],"debiasing,":[59,79],"integrate":[61],"these":[62],"into":[65,88],"the":[66,94,124,130],"embeddings,":[68],"enhancing":[69],"them":[70],"emphasize":[72],"fine-grained":[73],"salient":[75],"introduce":[81],"mechanism":[83],"disentangle":[85],"text":[86],"features":[87],"content":[89,100],"bias":[91,132],"components,":[92],"enabling":[93],"model":[95],"focus":[97],"on":[98,142],"meaningful":[99],"while":[101],"separately":[102],"handling":[103],"biased":[104],"information.":[105],"Extensive":[106],"experiments":[107],"ablation":[109],"studies":[110],"across":[111],"five":[112],"major":[113],"TVR":[114],"benchmarks":[115],"(i.e.,":[116],"MSR-VTT,":[117],"MSVD,":[118],"LSMDC,":[119],"ActivityNet,":[120],"DiDeMo)":[122],"demonstrate":[123],"competitive":[125],"performance":[126],"of":[127],"BiMa.":[128],"Additionally,":[129],"model's":[131],"mitigation":[133],"capability":[134],"is":[135],"consistently":[136],"validated":[137],"its":[139],"strong":[140],"results":[141],"out-of-distribution":[143],"tasks.":[145]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-25T00:00:00"}
