{"id":"https://openalex.org/W4415537401","doi":"https://doi.org/10.1145/3746027.3762090","title":"ENRIC: EveNt-AwaRe Captioning with Image Retrieval via UnCertainty-Guided Re-ranking and Semantic Ensemble Reasoning","display_name":"ENRIC: EveNt-AwaRe Captioning with Image Retrieval via UnCertainty-Guided Re-ranking and Semantic Ensemble Reasoning","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415537401","doi":"https://doi.org/10.1145/3746027.3762090"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3762090","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3762090","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5042049179","display_name":"Ngoc-Quan Nguyen","orcid":"https://orcid.org/0009-0007-7328-2781"},"institutions":[{"id":"https://openalex.org/I123565023","display_name":"Vietnam National University Ho Chi Minh City","ror":"https://ror.org/00waaqh38","country_code":"VN","type":"education","lineage":["https://openalex.org/I123565023"]}],"countries":["VN"],"is_corresponding":true,"raw_author_name":"Nam-Quan Nguyen","raw_affiliation_strings":["University of Science, VNU-HCM, Ho Chi Minh City, Vietnam and Vietnam National University, Ho Chi Minh City, Vietnam"],"affiliations":[{"raw_affiliation_string":"University of Science, VNU-HCM, Ho Chi Minh City, Vietnam and Vietnam National University, Ho Chi Minh City, Vietnam","institution_ids":["https://openalex.org/I123565023"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112587053","display_name":"M. Le","orcid":"https://orcid.org/0009-0001-4115-3447"},"institutions":[{"id":"https://openalex.org/I123565023","display_name":"Vietnam National University Ho Chi Minh City","ror":"https://ror.org/00waaqh38","country_code":"VN","type":"education","lineage":["https://openalex.org/I123565023"]}],"countries":["VN"],"is_corresponding":false,"raw_author_name":"Minh-Hoang Le","raw_affiliation_strings":["University of Science, VNU-HCM, Ho Chi Minh City, Vietnam and Vietnam National University, Ho Chi Minh City, Vietnam"],"affiliations":[{"raw_affiliation_string":"University of Science, VNU-HCM, Ho Chi Minh City, Vietnam and Vietnam National University, Ho Chi Minh City, Vietnam","institution_ids":["https://openalex.org/I123565023"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5095747780","display_name":"Vinh-Toan Vong","orcid":"https://orcid.org/0009-0009-4873-3258"},"institutions":[{"id":"https://openalex.org/I123565023","display_name":"Vietnam National University Ho Chi Minh City","ror":"https://ror.org/00waaqh38","country_code":"VN","type":"education","lineage":["https://openalex.org/I123565023"]}],"countries":["VN"],"is_corresponding":false,"raw_author_name":"Vinh-Toan Vong","raw_affiliation_strings":["University of Science, VNU-HCM, Ho Chi Minh City, Vietnam and Vietnam National University, Ho Chi Minh City, Vietnam"],"affiliations":[{"raw_affiliation_string":"University of Science, VNU-HCM, Ho Chi Minh City, Vietnam and Vietnam National University, Ho Chi Minh City, Vietnam","institution_ids":["https://openalex.org/I123565023"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5053495766","display_name":"Minh\u2013Triet Tran","orcid":"https://orcid.org/0000-0003-3046-3041"},"institutions":[{"id":"https://openalex.org/I123565023","display_name":"Vietnam National University Ho Chi Minh City","ror":"https://ror.org/00waaqh38","country_code":"VN","type":"education","lineage":["https://openalex.org/I123565023"]}],"countries":["VN"],"is_corresponding":false,"raw_author_name":"Minh-Triet Tran","raw_affiliation_strings":["University of Science, VNU-HCM, Ho Chi Minh City, Vietnam and Vietnam National University, Ho Chi Minh City, Vietnam"],"affiliations":[{"raw_affiliation_string":"University of Science, VNU-HCM, Ho Chi Minh City, Vietnam and Vietnam National University, Ho Chi Minh City, Vietnam","institution_ids":["https://openalex.org/I123565023"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5042049179"],"corresponding_institution_ids":["https://openalex.org/I123565023"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.32290017,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"14250","last_page":"14256"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9940000176429749,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9889000058174133,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.8980000019073486},{"id":"https://openalex.org/keywords/image-retrieval","display_name":"Image retrieval","score":0.5810999870300293},{"id":"https://openalex.org/keywords/event","display_name":"Event (particle physics)","score":0.5605999827384949},{"id":"https://openalex.org/keywords/heuristics","display_name":"Heuristics","score":0.5594000220298767},{"id":"https://openalex.org/keywords/ranking","display_name":"Ranking (information retrieval)","score":0.5544999837875366},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.5054000020027161},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.4251999855041504},{"id":"https://openalex.org/keywords/search-engine-indexing","display_name":"Search engine indexing","score":0.41819998621940613},{"id":"https://openalex.org/keywords/parsing","display_name":"Parsing","score":0.39959999918937683}],"concepts":[{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.8980000019073486},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7972000241279602},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.6291000247001648},{"id":"https://openalex.org/C1667742","wikidata":"https://www.wikidata.org/wiki/Q10927554","display_name":"Image retrieval","level":3,"score":0.5810999870300293},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.5605999827384949},{"id":"https://openalex.org/C127705205","wikidata":"https://www.wikidata.org/wiki/Q5748245","display_name":"Heuristics","level":2,"score":0.5594000220298767},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.5544999837875366},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5504000186920166},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.5054000020027161},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.4251999855041504},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.41819998621940613},{"id":"https://openalex.org/C186644900","wikidata":"https://www.wikidata.org/wiki/Q194152","display_name":"Parsing","level":2,"score":0.39959999918937683},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3962000012397766},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3822999894618988},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3544999957084656},{"id":"https://openalex.org/C108010975","wikidata":"https://www.wikidata.org/wiki/Q500094","display_name":"Pruning","level":2,"score":0.3474000096321106},{"id":"https://openalex.org/C199579030","wikidata":"https://www.wikidata.org/wiki/Q2851778","display_name":"Automatic image annotation","level":4,"score":0.34599998593330383},{"id":"https://openalex.org/C164226766","wikidata":"https://www.wikidata.org/wiki/Q7293202","display_name":"Rank (graph theory)","level":2,"score":0.33309999108314514},{"id":"https://openalex.org/C86037889","wikidata":"https://www.wikidata.org/wiki/Q4330127","display_name":"Learning to rank","level":3,"score":0.3203999996185303},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.3075999915599823},{"id":"https://openalex.org/C2781195486","wikidata":"https://www.wikidata.org/wiki/Q289436","display_name":"Texture (cosmology)","level":3,"score":0.2971999943256378},{"id":"https://openalex.org/C56461940","wikidata":"https://www.wikidata.org/wiki/Q970687","display_name":"Eye tracking","level":2,"score":0.2953000068664551},{"id":"https://openalex.org/C2983174267","wikidata":"https://www.wikidata.org/wiki/Q3775098","display_name":"Video retrieval","level":2,"score":0.27469998598098755},{"id":"https://openalex.org/C2777946921","wikidata":"https://www.wikidata.org/wiki/Q7449044","display_name":"Semantic analysis (machine learning)","level":2,"score":0.2685999870300293},{"id":"https://openalex.org/C89992363","wikidata":"https://www.wikidata.org/wiki/Q5961558","display_name":"Track (disk drive)","level":2,"score":0.2669000029563904},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.26429998874664307},{"id":"https://openalex.org/C2987933465","wikidata":"https://www.wikidata.org/wiki/Q141130","display_name":"Image manipulation","level":3,"score":0.2599000036716461},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.250900000333786}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3762090","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3762090","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":7,"referenced_works":["https://openalex.org/W1956340063","https://openalex.org/W2963138277","https://openalex.org/W2982573303","https://openalex.org/W3034689697","https://openalex.org/W3089978207","https://openalex.org/W3093910428","https://openalex.org/W3117993946"],"related_works":[],"abstract_inverted_index":{"In":[0,60],"many":[1],"real-world":[2],"applications,":[3],"labeling":[4],"an":[5,66],"image":[6,191],"''a":[7],"man":[8],"riding":[9],"a":[10],"horse''":[11],"fails":[12],"to":[13,47,57,144],"satisfy":[14],"demands":[15],"for":[16,70,189],"the":[17,34,71,77,146,155,184],"who,":[18],"when,":[19],"where,":[20],"and":[21,116,126,141,164,178],"why.":[22],"Although":[23],"LVLMs":[24],"excel":[25],"at":[26],"describing":[27],"visual":[28,121],"content,":[29],"isolated":[30],"images":[31,102],"often":[32],"lack":[33],"event":[35],"context;":[36],"users":[37],"thus":[38],"rely":[39],"on":[40],"related":[41],"news":[42,83],"articles":[43,84],"or":[44,52],"social":[45],"posts":[46],"enrich":[48],"them,":[49],"but":[50],"cropping":[51],"resizing":[53],"complicates":[54],"tracking":[55],"back":[56],"their":[58],"source.":[59],"this":[61],"paper,":[62],"we":[63],"propose":[64],"ENRIC,":[65],"innovative":[67],"end-to-end":[68],"system":[69,92],"EVENTA":[72],"Challenge":[73],"Track":[74],"1,":[75],"leveraging":[76],"OpenEvents-V1":[78],"dataset,":[79],"comprising":[80],"over":[81],"200,000":[82],"paired":[85],"with":[86,123],"more":[87],"than":[88],"400,000":[89],"images.":[90],"Our":[91],"includes":[93],"three":[94,113],"components:":[95],"(1)":[96],"semantic":[97,174],"retrieval":[98],"filters":[99],"candidate":[100],"article":[101],"via":[103],"vision-language":[104],"embeddings,":[105],"(2)":[106],"uncertainty-guided":[107,176],"re-ranking":[108],"flags":[109],"ambiguous":[110],"queries":[111],"using":[112],"confidence":[114],"heuristics":[115],"re-ranks":[117],"candidates":[118],"by":[119],"combining":[120,173],"similarity":[122],"texture":[124],"similarity,":[125],"(3)":[127],"event-aware":[128,179],"caption":[129,180],"generation":[130],"employs":[131],"chain-of-thought":[132],"prompting":[133],"that":[134],"aggregates":[135],"five":[136],"inputs":[137],"from":[138],"article,":[139],"image,":[140],"CIDEr-derived":[142],"contexts":[143],"guide":[145],"LLM":[147],"in":[148],"incorporating":[149],"all":[150,170],"necessary":[151],"elements.":[152],"ENRIC":[153,182],"achieved":[154],"highest":[156],"combined":[157],"evaluation":[158],"score":[159],"of":[160,186],"0.5501,":[161],"ranking":[162],"first":[163],"outperforming":[165],"other":[166],"solutions":[167],"across":[168],"nearly":[169],"metrics.":[171],"By":[172],"retrieval,":[175],"re-ranking,":[177],"generation,":[181],"demonstrates":[183],"efficiency":[185],"its":[187],"approach":[188],"event-enriched":[190],"analysis.":[192],"GitHub":[193],"repository:":[194],"https://github.com/NamQuanProject/EVENTA25-ENRIC":[195]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-25T00:00:00"}
