{"id":"https://openalex.org/W7124897083","doi":"https://doi.org/10.1109/cbmi66578.2025.11339302","title":"VoiceVision: AI-Powered Speaker-Aware Cropping and Content Indexing for Multi-Speaker Videos","display_name":"VoiceVision: AI-Powered Speaker-Aware Cropping and Content Indexing for Multi-Speaker Videos","publication_year":2025,"publication_date":"2025-10-22","ids":{"openalex":"https://openalex.org/W7124897083","doi":"https://doi.org/10.1109/cbmi66578.2025.11339302"},"language":null,"primary_location":{"id":"doi:10.1109/cbmi66578.2025.11339302","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cbmi66578.2025.11339302","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Conference on Content-Based Multimedia Indexing (CBMI)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5093104354","display_name":"Mehdi Houshmand Sarkhoosh","orcid":"https://orcid.org/0009-0008-4616-4592"},"institutions":[{"id":"https://openalex.org/I4210132524","display_name":"ForzaSys (Norway)","ror":"https://ror.org/02zhfb961","country_code":"NO","type":"company","lineage":["https://openalex.org/I4210132524"]}],"countries":["NO"],"is_corresponding":true,"raw_author_name":"Mehdi Houshmand Sarkhoosh","raw_affiliation_strings":["OsloMet &#x0026; Forzasys,Oslo,Norway"],"affiliations":[{"raw_affiliation_string":"OsloMet &#x0026; Forzasys,Oslo,Norway","institution_ids":["https://openalex.org/I4210132524"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068128546","display_name":"Cise Midoglu","orcid":"https://orcid.org/0000-0003-0991-4418"},"institutions":[{"id":"https://openalex.org/I4210132524","display_name":"ForzaSys (Norway)","ror":"https://ror.org/02zhfb961","country_code":"NO","type":"company","lineage":["https://openalex.org/I4210132524"]}],"countries":["NO"],"is_corresponding":false,"raw_author_name":"Cise Midoglu","raw_affiliation_strings":["Forzasys,Oslo,Norway"],"affiliations":[{"raw_affiliation_string":"Forzasys,Oslo,Norway","institution_ids":["https://openalex.org/I4210132524"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019910849","display_name":"Saeed Shafiee Sabet","orcid":"https://orcid.org/0000-0001-5348-8546"},"institutions":[{"id":"https://openalex.org/I4210132524","display_name":"ForzaSys (Norway)","ror":"https://ror.org/02zhfb961","country_code":"NO","type":"company","lineage":["https://openalex.org/I4210132524"]}],"countries":["NO"],"is_corresponding":false,"raw_author_name":"Saeed S. Sabet","raw_affiliation_strings":["Forzasys,Oslo,Norway"],"affiliations":[{"raw_affiliation_string":"Forzasys,Oslo,Norway","institution_ids":["https://openalex.org/I4210132524"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123372230","display_name":"Tomas Kupka","orcid":null},"institutions":[{"id":"https://openalex.org/I4210132524","display_name":"ForzaSys (Norway)","ror":"https://ror.org/02zhfb961","country_code":"NO","type":"company","lineage":["https://openalex.org/I4210132524"]}],"countries":["NO"],"is_corresponding":false,"raw_author_name":"Tomas Kupka","raw_affiliation_strings":["Forzasys,Oslo,Norway"],"affiliations":[{"raw_affiliation_string":"Forzasys,Oslo,Norway","institution_ids":["https://openalex.org/I4210132524"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5021525405","display_name":"P. Halvorsen","orcid":null},"institutions":[{"id":"https://openalex.org/I4210153474","display_name":"Simula Metropolitan Center for Digital Engineering","ror":"https://ror.org/04xtarr15","country_code":"NO","type":"nonprofit","lineage":["https://openalex.org/I184531372","https://openalex.org/I2799829267","https://openalex.org/I4210153474"]}],"countries":["NO"],"is_corresponding":false,"raw_author_name":"P\u00e5l Halvorsen","raw_affiliation_strings":["SimulaMet, OsloMet &#x0026; Forzasys,Oslo,Norway"],"affiliations":[{"raw_affiliation_string":"SimulaMet, OsloMet &#x0026; Forzasys,Oslo,Norway","institution_ids":["https://openalex.org/I4210153474"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5093104354"],"corresponding_institution_ids":["https://openalex.org/I4210132524"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.83224428,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.15209999680519104,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.15209999680519104,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.1362999975681305,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.12120000272989273,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/search-engine-indexing","display_name":"Search engine indexing","score":0.7814000248908997},{"id":"https://openalex.org/keywords/adaptation","display_name":"Adaptation (eye)","score":0.4302000105381012},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.35100001096725464},{"id":"https://openalex.org/keywords/content","display_name":"Content (measure theory)","score":0.3271999955177307},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.30239999294281006},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.28690001368522644}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8109999895095825},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.7814000248908997},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5246000289916992},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.45890000462532043},{"id":"https://openalex.org/C139807058","wikidata":"https://www.wikidata.org/wiki/Q352374","display_name":"Adaptation (eye)","level":2,"score":0.4302000105381012},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.35100001096725464},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.33009999990463257},{"id":"https://openalex.org/C2778152352","wikidata":"https://www.wikidata.org/wiki/Q5165061","display_name":"Content (measure theory)","level":2,"score":0.3271999955177307},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.30239999294281006},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.28690001368522644},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2854999899864197},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.2815000116825104},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.27880001068115234},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.26989999413490295},{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.26080000400543213},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.25940001010894775},{"id":"https://openalex.org/C2778330532","wikidata":"https://www.wikidata.org/wiki/Q4826577","display_name":"Automatic indexing","level":3,"score":0.25279998779296875}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/cbmi66578.2025.11339302","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cbmi66578.2025.11339302","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Conference on Content-Based Multimedia Indexing (CBMI)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.5380911827087402}],"awards":[{"id":"https://openalex.org/G5937660701","display_name":null,"funder_award_id":"354154","funder_id":"https://openalex.org/F4320323299","funder_display_name":"Norges Forskningsr\u00e5d"}],"funders":[{"id":"https://openalex.org/F4320323299","display_name":"Norges Forskningsr\u00e5d","ror":"https://ror.org/00epmv149"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":5,"referenced_works":["https://openalex.org/W4286378963","https://openalex.org/W4391287727","https://openalex.org/W4392981906","https://openalex.org/W4401597516","https://openalex.org/W4401809421"],"related_works":[],"abstract_inverted_index":{"VoiceVision":[0],"is":[1,155],"an":[2],"AI-powered":[3],"system":[4,56],"designed":[5],"for":[6,134],"intelligent":[7],"speaker-focused":[8],"video":[9,39,120],"cropping":[10,121],"and":[11,33,36,92,132,146],"speaker-aware":[12,119],"content":[13,95,129],"indexing.":[14],"Built":[15],"on":[16,42,97,140],"top":[17],"of":[18,94,151],"the":[19,38,43,55,152],"TalkNet":[20],"audio-visual":[21],"speaker":[22,100],"diarization":[23],"backbone,":[24],"Voice":[25,102,126],"Vision":[26,103,127],"detects":[27],"active":[28],"speakers":[29],"in":[30],"multi-speaker":[31],"videos":[32,139],"dynamically":[34],"crops":[35],"reframes":[37],"to":[40,52,64,77,112],"center":[41],"current":[44],"speaker,":[45],"creating":[46],"smooth":[47],"visual":[48],"transitions.":[49],"In":[50],"addition":[51],"smart":[53],"cropping,":[54],"integrates":[57],"automatic":[58,105],"speech":[59,86],"recognition":[60],"(ASR)":[61],"using":[62],"Whisper":[63],"generate":[65,113],"accurate":[66],"transcriptions,":[67,125],"which":[68],"are":[69],"further":[70],"processed":[71],"through":[72],"a":[73],"transcript":[74],"attribution":[75],"module":[76,88],"associate":[78],"spoken":[79],"segments":[80],"with":[81,122],"specific":[82],"speakers.":[83],"A":[84,149],"dedicated":[85],"search":[87],"enables":[89],"efficient":[90],"retrieval":[91],"indexing":[93],"based":[96],"keywords":[98],"or":[99,137],"identity.":[101],"supports":[104],"aspect":[106],"ratio":[107],"adaptation":[108],"(9:16,":[109],"1:1,":[110],"4:5)":[111],"social-media-":[114],"optimized":[115],"outputs.":[116],"By":[117],"combining":[118],"searchable,":[123],"speaker-attributed":[124],"simplifies":[128],"creation,":[130],"indexing,":[131],"sharing":[133],"interview-":[135],"style":[136],"conversational":[138],"platforms":[141],"such":[142],"as":[143],"TikTok,":[144],"Instagram,":[145],"YouTube":[147],"Shorts.":[148],"demonstration":[150],"system's":[153],"capabilities":[154],"presented":[156],"at":[157],"https://youtu.be/SBSqOyMpe60":[158]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2026-01-21T00:00:00"}
