{"id":"https://openalex.org/W7164858911","doi":"https://doi.org/10.1145/3805622.3810702","title":"DenseSpeech: Dense Multi-Segment Temporal Grounding in Public Speaking Videos","display_name":"DenseSpeech: Dense Multi-Segment Temporal Grounding in Public Speaking Videos","publication_year":2026,"publication_date":"2026-06-15","ids":{"openalex":"https://openalex.org/W7164858911","doi":"https://doi.org/10.1145/3805622.3810702"},"language":null,"primary_location":{"id":"doi:10.1145/3805622.3810702","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810702","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3805622.3810702","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102663760","display_name":"Jiachen Tan","orcid":"https://orcid.org/0009-0006-0410-7499"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiachen Tan","raw_affiliation_strings":["Beijing University of Posts and Telecommunications, Beijing, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-1059-7037","affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications, Beijing, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100329603","display_name":"Tingting Zhang","orcid":"https://orcid.org/0000-0001-8182-6346"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tingting Zhang","raw_affiliation_strings":["Beijing University of Posts and Telecommunications, Beijing, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-8182-6346","affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications, Beijing, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137976726","display_name":"Tao Zhou (117050)","orcid":null},"institutions":[{"id":"https://openalex.org/I6507939","display_name":"China United Network Communications Group (China)","ror":"https://ror.org/028w99c90","country_code":"CN","type":"company","lineage":["https://openalex.org/I6507939"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tao Zhou","raw_affiliation_strings":["China Unicom Software Research Institute, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0006-2148-8473","affiliations":[{"raw_affiliation_string":"China Unicom Software Research Institute, Beijing, China","institution_ids":["https://openalex.org/I6507939"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5138643795","display_name":"Guangyao Su","orcid":"https://orcid.org/0009-0005-3206-0651"},"institutions":[{"id":"https://openalex.org/I4210128818","display_name":"Institute of Software","ror":"https://ror.org/033dfsn42","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210128818"]},{"id":"https://openalex.org/I6507939","display_name":"China United Network Communications Group (China)","ror":"https://ror.org/028w99c90","country_code":"CN","type":"company","lineage":["https://openalex.org/I6507939"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guangyao Su","raw_affiliation_strings":["China Unicom Software Research Institute, Beijing, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0005-3206-0651","affiliations":[{"raw_affiliation_string":"China Unicom Software Research Institute, Beijing, Beijing, China","institution_ids":["https://openalex.org/I6507939","https://openalex.org/I4210128818"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102350905","display_name":"Jianwei Fang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210128818","display_name":"Institute of Software","ror":"https://ror.org/033dfsn42","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210128818"]},{"id":"https://openalex.org/I6507939","display_name":"China United Network Communications Group (China)","ror":"https://ror.org/028w99c90","country_code":"CN","type":"company","lineage":["https://openalex.org/I6507939"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianwei Fang","raw_affiliation_strings":["China Unicom Software Research Institute, Beijing, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0009-7082-3356","affiliations":[{"raw_affiliation_string":"China Unicom Software Research Institute, Beijing, Beijing, China","institution_ids":["https://openalex.org/I6507939","https://openalex.org/I4210128818"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101432128","display_name":"Bin Wu","orcid":"https://orcid.org/0000-0002-7112-126X"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bin Wu","raw_affiliation_strings":["Beijing University of Posts and Telecommunications, Beijing, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-7112-126X","affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications, Beijing, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5064935950","display_name":"Chunping Zheng","orcid":"https://orcid.org/0000-0002-0407-2381"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chunping Zheng","raw_affiliation_strings":["Beijing University of Posts and Telecommunications, Beijing, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-0407-2381","affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications, Beijing, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.94032194,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1317","last_page":"1326"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7096999883651733,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7096999883651733,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.09269999712705612,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.06279999762773514,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/event","display_name":"Event (particle physics)","score":0.6582000255584717},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5741999745368958},{"id":"https://openalex.org/keywords/boundary","display_name":"Boundary (topology)","score":0.5098000168800354},{"id":"https://openalex.org/keywords/annotation","display_name":"Annotation","score":0.5084999799728394},{"id":"https://openalex.org/keywords/paragraph","display_name":"Paragraph","score":0.453900009393692},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.44780001044273376},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3400000035762787},{"id":"https://openalex.org/keywords/ground","display_name":"Ground","score":0.3287000060081482}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7807000279426575},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.6582000255584717},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5741999745368958},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5493999719619751},{"id":"https://openalex.org/C62354387","wikidata":"https://www.wikidata.org/wiki/Q875399","display_name":"Boundary (topology)","level":2,"score":0.5098000168800354},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.5084999799728394},{"id":"https://openalex.org/C2777206241","wikidata":"https://www.wikidata.org/wiki/Q194431","display_name":"Paragraph","level":2,"score":0.453900009393692},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4496999979019165},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.44780001044273376},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3400000035762787},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3379000127315521},{"id":"https://openalex.org/C168993435","wikidata":"https://www.wikidata.org/wiki/Q6501125","display_name":"Ground","level":2,"score":0.3287000060081482},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.3239000141620636},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.3100000023841858},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2854999899864197},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2840000092983246},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.28279998898506165},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.27070000767707825},{"id":"https://openalex.org/C2776141515","wikidata":"https://www.wikidata.org/wiki/Q1274479","display_name":"Repetition (rhetorical device)","level":2,"score":0.262800008058548},{"id":"https://openalex.org/C83665646","wikidata":"https://www.wikidata.org/wiki/Q42139305","display_name":"Feature vector","level":2,"score":0.26269999146461487},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.2599000036716461},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.25189998745918274}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3805622.3810702","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810702","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3805622.3810702","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810702","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.6105469465255737,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[{"id":"https://openalex.org/G1814040290","display_name":null,"funder_award_id":"Grant No. 62372060","funder_id":"https://openalex.org/F4320336806","funder_display_name":"National Natural Science Foundation of China - State Grid Corporation Joint Fund for Smart Grid"}],"funders":[{"id":"https://openalex.org/F4320336806","display_name":"National Natural Science Foundation of China - State Grid Corporation Joint Fund for Smart Grid","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W1969945102","https://openalex.org/W2025827175","https://openalex.org/W2111078031","https://openalex.org/W2526050071","https://openalex.org/W2963017553","https://openalex.org/W2963524571","https://openalex.org/W2964089981","https://openalex.org/W3109908659","https://openalex.org/W3174572181","https://openalex.org/W3175402857","https://openalex.org/W4224993577","https://openalex.org/W4312519428","https://openalex.org/W4386066129","https://openalex.org/W4386083094","https://openalex.org/W4387968116","https://openalex.org/W4390873341","https://openalex.org/W4392562622","https://openalex.org/W4402671847","https://openalex.org/W4402727552","https://openalex.org/W4402753830","https://openalex.org/W4402754238","https://openalex.org/W4402917053","https://openalex.org/W4404784276","https://openalex.org/W4405152495","https://openalex.org/W4409346443","https://openalex.org/W4409364991","https://openalex.org/W4409369885","https://openalex.org/W4413158147","https://openalex.org/W4415797954"],"related_works":[],"abstract_inverted_index":{"Temporal":[0],"video":[1,70,83],"grounding":[2,18,71,84,126],"aims":[3],"to":[4,49,56,72,136,150],"localize":[5],"temporal":[6,139],"segments":[7,54,99],"in":[8,36,86],"untrimmed":[9],"videos":[10],"based":[11],"on":[12,193],"natural":[13],"language":[14],"queries.":[15],"While":[16],"dense":[17,69,124],"methods":[19],"accept":[20],"paragraph":[21],"queries,":[22],"they":[23],"retain":[24],"a":[25,44,73,152],"restrictive":[26],"one-to-one":[27],"mapping":[28],"between":[29],"sentences":[30],"and":[31,52,60,76,111,141,173,195,204],"segments.":[32],"This":[33],"assumption":[34],"fails":[35],"real-world":[37],"behavioral":[38],"analysis":[39],"like":[40],"public":[41,196],"speaking,":[42],"where":[43],"single":[45],"query":[46],"often":[47],"corresponds":[48],"multiple,":[50],"scattered,":[51],"overlapping":[53],"due":[55],"event":[57,113],"repetition,":[58],"co-occurrence,":[59],"cross-modal":[61],"complexity.":[62],"To":[63],"address":[64],"this":[65],"gap,":[66],"we":[67],"extend":[68],"multi-segment":[74],"setting":[75],"present":[77],"DenseSpeech,":[78],"the":[79,87,202],"first":[80],"densely":[81],"annotated":[82,98],"dataset":[85,203],"speech":[88],"domain.":[89],"DenseSpeech":[90,194],"comprises":[91],"1,800":[92],"authentic":[93],"classroom":[94],"recordings":[95],"with":[96,115,147],"13,880":[97],"spanning":[100],"over":[101],"52":[102],"hours,":[103],"featuring":[104],"high":[105],"annotation":[106],"density,":[107],"frequent":[108],"boundary":[109],"overlap,":[110],"pervasive":[112],"repetition":[114],"form":[116],"variability.":[117],"We":[118],"further":[119],"propose":[120],"DenseMSG,":[121],"an":[122],"end-to-end":[123],"multi-scale":[125,153],"model.":[127,205],"Its":[128],"Cross-modal":[129],"Consistent":[130],"Interaction":[131],"module":[132,159],"employs":[133],"text-guided":[134],"cross-attention":[135],"highlight":[137],"query-relevant":[138],"regions":[140],"performs":[142],"hierarchical":[143],"bidirectional":[144,161],"audiovisual":[145],"fusion":[146],"adaptive":[148],"gating":[149],"build":[151],"feature":[154,162],"pyramid.":[155],"A":[156],"Multi-Scale":[157],"Integration":[158],"enables":[160],"propagation":[163],"across":[164],"scales":[165],"for":[166],"complementary":[167],"global-local":[168],"modeling,":[169],"while":[170],"similarity-based":[171],"classification":[172],"query-aware":[174],"regression":[175],"heads":[176],"jointly":[177],"predict":[178],"boundaries":[179],"at":[180],"all":[181],"scales,":[182],"naturally":[183],"supporting":[184],"one-to-many":[185],"retrieval.":[186],"Experiments":[187],"show":[188],"DenseMSG":[189],"achieves":[190],"state-of-the-art":[191],"performance":[192],"benchmarks":[197],"(TACoS,":[198],"ActivityNet":[199],"Captions),":[200],"validating":[201]},"counts_by_year":[],"updated_date":"2026-06-16T07:37:23.134862","created_date":"2026-06-16T00:00:00"}
