{"id":"https://openalex.org/W7111372480","doi":"https://doi.org/10.48550/arxiv.2512.06334","title":"Enhanced Multimodal Video Retrieval System: Integrating Query Expansion and Cross-modal Temporal Event Retrieval","display_name":"Enhanced Multimodal Video Retrieval System: Integrating Query Expansion and Cross-modal Temporal Event Retrieval","publication_year":2025,"publication_date":"2025-12-06","ids":{"openalex":"https://openalex.org/W7111372480","doi":"https://doi.org/10.48550/arxiv.2512.06334"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2512.06334","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2512.06334","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2512.06334","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Vo, Van-Thinh","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Vo, Van-Thinh","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Nguyen, Minh-Khoi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nguyen, Minh-Khoi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Tran, Minh-Huy","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tran, Minh-Huy","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Nguyen-Tran, Anh-Quan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nguyen-Tran, Anh-Quan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Nguyen, Duy-Tan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nguyen, Duy-Tan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Nguyen, Khanh-Loi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nguyen, Khanh-Loi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Phan, Anh-Minh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Phan, Anh-Minh","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9010000228881836,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9010000228881836,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.04280000180006027,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.017500000074505806,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.6376000046730042},{"id":"https://openalex.org/keywords/video-retrieval","display_name":"Video retrieval","score":0.613099992275238},{"id":"https://openalex.org/keywords/query-expansion","display_name":"Query expansion","score":0.5648999810218811},{"id":"https://openalex.org/keywords/thresholding","display_name":"Thresholding","score":0.5130000114440918},{"id":"https://openalex.org/keywords/visual-word","display_name":"Visual Word","score":0.48750001192092896},{"id":"https://openalex.org/keywords/image-retrieval","display_name":"Image retrieval","score":0.4602999985218048},{"id":"https://openalex.org/keywords/kernel-density-estimation","display_name":"Kernel density estimation","score":0.4113999903202057},{"id":"https://openalex.org/keywords/event","display_name":"Event (particle physics)","score":0.40880000591278076},{"id":"https://openalex.org/keywords/limiting","display_name":"Limiting","score":0.3853999972343445}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8335000276565552},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.6376000046730042},{"id":"https://openalex.org/C2983174267","wikidata":"https://www.wikidata.org/wiki/Q3775098","display_name":"Video retrieval","level":2,"score":0.613099992275238},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.565500020980835},{"id":"https://openalex.org/C99016210","wikidata":"https://www.wikidata.org/wiki/Q5488129","display_name":"Query expansion","level":2,"score":0.5648999810218811},{"id":"https://openalex.org/C191178318","wikidata":"https://www.wikidata.org/wiki/Q2256906","display_name":"Thresholding","level":3,"score":0.5130000114440918},{"id":"https://openalex.org/C189391414","wikidata":"https://www.wikidata.org/wiki/Q7936579","display_name":"Visual Word","level":4,"score":0.48750001192092896},{"id":"https://openalex.org/C1667742","wikidata":"https://www.wikidata.org/wiki/Q10927554","display_name":"Image retrieval","level":3,"score":0.4602999985218048},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.45899999141693115},{"id":"https://openalex.org/C71134354","wikidata":"https://www.wikidata.org/wiki/Q458825","display_name":"Kernel density estimation","level":3,"score":0.4113999903202057},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.40880000591278076},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.3853999972343445},{"id":"https://openalex.org/C61224824","wikidata":"https://www.wikidata.org/wiki/Q2260434","display_name":"Mixture model","level":2,"score":0.33469998836517334},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.3294999897480011},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.32739999890327454},{"id":"https://openalex.org/C192028432","wikidata":"https://www.wikidata.org/wiki/Q845739","display_name":"Query language","level":2,"score":0.3199999928474426},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.31690001487731934},{"id":"https://openalex.org/C2779532271","wikidata":"https://www.wikidata.org/wiki/Q445558","display_name":"Relevance feedback","level":4,"score":0.3109999895095825},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.29280000925064087},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2782999873161316},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.2777000069618225},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2734000086784363},{"id":"https://openalex.org/C102392041","wikidata":"https://www.wikidata.org/wiki/Q592860","display_name":"Sliding window protocol","level":3,"score":0.2727000117301941},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2712000012397766},{"id":"https://openalex.org/C53533937","wikidata":"https://www.wikidata.org/wiki/Q185020","display_name":"Histogram","level":3,"score":0.26440000534057617},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2515000104904175}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2512.06334","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2512.06334","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2512.06334","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2512.06334","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.5831778645515442,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Multimedia":[0],"information":[1],"retrieval":[2,57,111,132],"from":[3],"videos":[4],"remains":[5],"a":[6,34,53,69,119],"challenging":[7],"problem.":[8],"While":[9],"recent":[10],"systems":[11],"have":[12],"advanced":[13],"multimodal":[14],"search":[15],"through":[16,142],"semantic,":[17],"object,":[18],"and":[19,23,78,113,126,138],"OCR":[20],"queries":[21],"-":[22,29],"can":[24],"retrieve":[25],"temporally":[26],"consecutive":[27],"scenes":[28,67],"they":[30],"often":[31],"rely":[32],"on":[33],"single":[35],"query":[36,62],"modality":[37],"for":[38,75],"an":[39],"entire":[40],"sequence,":[41],"limiting":[42],"robustness":[43,139],"in":[44,146],"complex":[45],"temporal":[46,55],"contexts.":[47],"To":[48,71],"overcome":[49],"this,":[50],"we":[51,82],"propose":[52],"cross-modal":[54],"event":[56],"framework":[58],"that":[59,104],"enables":[60],"different":[61],"modalities":[63],"to":[64,124],"describe":[65],"distinct":[66],"within":[68],"sequence.":[70],"determine":[72],"decision":[73],"thresholds":[74],"scene":[76],"transition":[77],"slide":[79],"change":[80],"adaptively,":[81],"build":[83],"Kernel":[84],"Density":[85],"Gaussian":[86],"Mixture":[87],"Thresholding":[88],"(KDE-GMM)":[89],"algorithm,":[90],"ensuring":[91],"optimal":[92],"keyframe":[93],"selection.":[94],"These":[95],"extracted":[96],"keyframes":[97],"act":[98],"as":[99],"compact,":[100],"high-quality":[101],"visual":[102],"exemplars":[103],"retain":[105],"each":[106],"segment's":[107],"semantic":[108],"essence,":[109],"improving":[110],"precision":[112],"efficiency.":[114],"Additionally,":[115],"the":[116,147],"system":[117],"incorporates":[118],"large":[120],"language":[121],"model":[122],"(LLM)":[123],"refine":[125],"expand":[127],"user":[128],"queries,":[129],"enhancing":[130],"overall":[131],"performance.":[133],"The":[134],"proposed":[135],"system's":[136],"effectiveness":[137],"were":[140],"demonstrated":[141],"its":[143],"strong":[144],"results":[145],"Ho":[148],"Chi":[149],"Minh":[150],"AI":[151],"Challenge":[152],"2025.":[153]},"counts_by_year":[],"updated_date":"2025-12-10T02:49:46.989445","created_date":"2025-12-10T00:00:00"}
