{"id":"https://openalex.org/W7123666185","doi":"https://doi.org/10.1109/mmsp64401.2025.11324394","title":"CG-SMFNet: Consensus-Guided Selective Multimodal Fusion for Weakly Supervised Temporal Action Localization","display_name":"CG-SMFNet: Consensus-Guided Selective Multimodal Fusion for Weakly Supervised Temporal Action Localization","publication_year":2025,"publication_date":"2025-09-21","ids":{"openalex":"https://openalex.org/W7123666185","doi":"https://doi.org/10.1109/mmsp64401.2025.11324394"},"language":null,"primary_location":{"id":"doi:10.1109/mmsp64401.2025.11324394","is_oa":false,"landing_page_url":"https://doi.org/10.1109/mmsp64401.2025.11324394","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Workshop on Multimedia Signal Processing (MMSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100346777","display_name":"Peng Liu","orcid":"https://orcid.org/0000-0001-9702-6888"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Peng Liu","raw_affiliation_strings":["Beihang University,School of Instrumentation and Optoelectronic Engineering,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Beihang University,School of Instrumentation and Optoelectronic Engineering,Beijing,China","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5104327841","display_name":"Zitai Jiang","orcid":null},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zitai Jiang","raw_affiliation_strings":["Beihang University,School of Instrumentation and Optoelectronic Engineering,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Beihang University,School of Instrumentation and Optoelectronic Engineering,Beijing,China","institution_ids":["https://openalex.org/I82880672"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5100346777"],"corresponding_institution_ids":["https://openalex.org/I82880672"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.66372472,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"84","last_page":"89"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9690999984741211,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9690999984741211,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.008799999952316284,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.003800000064074993,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/complementarity","display_name":"Complementarity (molecular biology)","score":0.635200023651123},{"id":"https://openalex.org/keywords/fusion","display_name":"Fusion","score":0.5665000081062317},{"id":"https://openalex.org/keywords/fusion-mechanism","display_name":"Fusion mechanism","score":0.5112000107765198},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.4966999888420105},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.44760000705718994},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4050000011920929},{"id":"https://openalex.org/keywords/sharpening","display_name":"Sharpening","score":0.3955000042915344},{"id":"https://openalex.org/keywords/encode","display_name":"ENCODE","score":0.383899986743927}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.763700008392334},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.7073000073432922},{"id":"https://openalex.org/C202269582","wikidata":"https://www.wikidata.org/wiki/Q2644277","display_name":"Complementarity (molecular biology)","level":2,"score":0.635200023651123},{"id":"https://openalex.org/C158525013","wikidata":"https://www.wikidata.org/wiki/Q2593739","display_name":"Fusion","level":2,"score":0.5665000081062317},{"id":"https://openalex.org/C173414695","wikidata":"https://www.wikidata.org/wiki/Q5510276","display_name":"Fusion mechanism","level":4,"score":0.5112000107765198},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.4966999888420105},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.44760000705718994},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4050000011920929},{"id":"https://openalex.org/C2781137444","wikidata":"https://www.wikidata.org/wiki/Q237105","display_name":"Sharpening","level":2,"score":0.3955000042915344},{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.383899986743927},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.37790000438690186},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.3686999976634979},{"id":"https://openalex.org/C2781140086","wikidata":"https://www.wikidata.org/wiki/Q557945","display_name":"Confusion","level":2,"score":0.3538999855518341},{"id":"https://openalex.org/C2987834672","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Action recognition","level":3,"score":0.34850001335144043},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.33309999108314514},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.32670000195503235},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3000999987125397},{"id":"https://openalex.org/C33954974","wikidata":"https://www.wikidata.org/wiki/Q486494","display_name":"Sensor fusion","level":2,"score":0.29350000619888306},{"id":"https://openalex.org/C2775955345","wikidata":"https://www.wikidata.org/wiki/Q7449071","display_name":"Semantic mapping","level":2,"score":0.2824000120162964},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2770000100135803},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.2700999975204468},{"id":"https://openalex.org/C2775953691","wikidata":"https://www.wikidata.org/wiki/Q5013874","display_name":"CRFS","level":3,"score":0.25679999589920044},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.2540999948978424}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/mmsp64401.2025.11324394","is_oa":false,"landing_page_url":"https://doi.org/10.1109/mmsp64401.2025.11324394","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Workshop on Multimedia Signal Processing (MMSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":19,"referenced_works":["https://openalex.org/W1927052826","https://openalex.org/W1967014153","https://openalex.org/W2336403884","https://openalex.org/W2963541464","https://openalex.org/W3014641072","https://openalex.org/W3097664769","https://openalex.org/W3102569991","https://openalex.org/W3173459793","https://openalex.org/W3207172562","https://openalex.org/W4225505766","https://openalex.org/W4226500165","https://openalex.org/W4289535637","https://openalex.org/W4321021995","https://openalex.org/W4386066210","https://openalex.org/W4387526084","https://openalex.org/W4388430502","https://openalex.org/W4393206831","https://openalex.org/W4406354232","https://openalex.org/W4413146922"],"related_works":[],"abstract_inverted_index":{"Weakly":[0],"supervised":[1],"temporal":[2,18],"action":[3,11,43,109],"localization":[4,41,136],"(WTAL)":[5],"targets":[6],"the":[7,59,73,94,124],"joint":[8],"classification":[9],"of":[10,16,31,42,75,108],"categories":[12],"and":[13,45,50,98,129],"precise":[14],"delineation":[15],"their":[17],"boundaries":[19],"in":[20],"untrimmed":[21],"videos":[22],"while":[23],"relying":[24],"only":[25],"on":[26,140],"video-level":[27],"labels.":[28],"The":[29],"absence":[30],"frame-level":[32],"supervision":[33],"inevitably":[34],"causes":[35],"two":[36],"key":[37],"difficulties:":[38],"(i)":[39],"incomplete":[40],"segments":[44,128],"(ii)":[46],"confusion":[47],"between":[48,126],"foreground":[49,127],"background":[51,132],"frames.":[52],"To":[53],"overcome":[54],"these":[55],"challenges,":[56],"we":[57],"propose":[58],"Consensus-Guided":[60],"Selective":[61,68],"Multimodal":[62],"Fusion":[63,69],"Network":[64],"(CG-SMFNet).":[65],"First,":[66],"a":[67,84,104,112,118],"Module":[70],"(SFM)":[71],"exploits":[72],"complementarity":[74],"multimodal":[76],"cues":[77],"to":[78,93],"distill":[79],"rich":[80],"semantic":[81,119],"representations.":[82],"Second,":[83],"Consensus":[85],"Attention":[86],"Mechanism":[87,115],"(CAM)":[88],"dynamically":[89],"assigns":[90],"fusion":[91],"weights":[92],"three":[95],"modality":[96],"branches":[97],"enables":[99],"bidirectional":[100],"information":[101],"exchange,":[102],"ensuring":[103],"more":[105],"holistic":[106],"capture":[107],"content.":[110],"Finally,":[111],"Discrepant":[113],"Expansion":[114],"(DEM)":[116],"introduces":[117],"contrast":[120],"loss":[121],"that":[122,144],"enlarges":[123],"distance":[125],"semantically":[130],"similar":[131],"regions,":[133],"further":[134],"sharpening":[135],"accuracy.":[137],"Extensive":[138],"experiments":[139],"public":[141],"benchmarks":[142],"verify":[143],"CG-SMFNet":[145],"achieves":[146],"state-of-the-art":[147],"performance":[148],"under":[149],"weak":[150],"supervision.":[151]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2026-01-14T00:00:00"}
