{"id":"https://openalex.org/W2084827375","doi":"https://doi.org/10.4304/jmm.5.4.322-331","title":"Multimodal Speaker Segmentation and Identification in Presence of Overlapped Speech Segments","display_name":"Multimodal Speaker Segmentation and Identification in Presence of Overlapped Speech Segments","publication_year":2010,"publication_date":"2010-08-13","ids":{"openalex":"https://openalex.org/W2084827375","doi":"https://doi.org/10.4304/jmm.5.4.322-331","mag":"2084827375"},"language":"en","primary_location":{"id":"doi:10.4304/jmm.5.4.322-331","is_oa":false,"landing_page_url":"https://doi.org/10.4304/jmm.5.4.322-331","pdf_url":null,"source":{"id":"https://openalex.org/S16006927","display_name":"Journal of Multimedia","issn_l":"1796-2048","issn":["1796-2048"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310318660","host_organization_name":"Academy Publisher","host_organization_lineage":["https://openalex.org/P4310318660"],"host_organization_lineage_names":["Academy Publisher"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Multimedia","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5091009689","display_name":"Viktor Rozgi\u0107","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Viktor Rozgic","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106859201","display_name":"Kyu J. Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kyu J. Han","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021678540","display_name":"Panayiotis Georgiou","orcid":"https://orcid.org/0000-0002-0790-7161"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Panayiotis G. Georgiou","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5010028928","display_name":"Shrikanth Narayanan","orcid":"https://orcid.org/0000-0002-1052-6204"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shrikanth Narayanan","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5091009689"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.0114,"has_fulltext":false,"cited_by_count":8,"citation_normalized_percentile":{"value":0.76319544,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":96},"biblio":{"volume":"5","issue":"4","first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9934999942779541,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9919000267982483,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7788673639297485},{"id":"https://openalex.org/keywords/speaker-identification","display_name":"Speaker identification","score":0.7732948064804077},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7139357328414917},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.6958339810371399},{"id":"https://openalex.org/keywords/identification","display_name":"Identification (biology)","score":0.5727526545524597},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.5327306985855103},{"id":"https://openalex.org/keywords/speech-segmentation","display_name":"Speech segmentation","score":0.43195879459381104},{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.41427066922187805},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.36698511242866516},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.364069402217865}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7788673639297485},{"id":"https://openalex.org/C2986627078","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker identification","level":3,"score":0.7732948064804077},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7139357328414917},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.6958339810371399},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.5727526545524597},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.5327306985855103},{"id":"https://openalex.org/C207030507","wikidata":"https://www.wikidata.org/wiki/Q2266173","display_name":"Speech segmentation","level":3,"score":0.43195879459381104},{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.41427066922187805},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.36698511242866516},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.364069402217865},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C59822182","wikidata":"https://www.wikidata.org/wiki/Q441","display_name":"Botany","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.4304/jmm.5.4.322-331","is_oa":false,"landing_page_url":"https://doi.org/10.4304/jmm.5.4.322-331","pdf_url":null,"source":{"id":"https://openalex.org/S16006927","display_name":"Journal of Multimedia","issn_l":"1796-2048","issn":["1796-2048"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310318660","host_organization_name":"Academy Publisher","host_organization_lineage":["https://openalex.org/P4310318660"],"host_organization_lineage_names":["Academy Publisher"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Multimedia","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2126085626","https://openalex.org/W1521049138","https://openalex.org/W2129993754","https://openalex.org/W1485877426","https://openalex.org/W2123654563","https://openalex.org/W4221114270","https://openalex.org/W2921830315","https://openalex.org/W2997785206","https://openalex.org/W2185667427","https://openalex.org/W2060378667"],"abstract_inverted_index":{"We":[0,63,155],"describe":[1],"a":[2,17,29,35,42,49,65,127,174,190],"multimodal":[3],"algorithm":[4],"for":[5,32,38,53,58,161,200,222],"speaker":[6,39,43,131,209,215],"segmentation":[7,210],"and":[8,41,87,137,157,168,187,217,231],"identification":[9,44,216],"with":[10,60,224],"two":[11,159],"main":[12],"contributions:":[13],"First,":[14],"we":[15,47],"propose":[16,64],"hidden":[18,123],"Markov":[19,124],"model":[20,52,125],"architecture":[21],"that":[22,79,178],"performs":[23,179],"fusion":[24],"of":[25,67,98,105,129,134,144,150,165,228],"three":[26],"information":[27],"sources:":[28],"multicamera":[30],"system":[31],"participant":[33],"localization,":[34,40],"microphone":[36,55,85,93,219],"array":[37,56,94,220],"system.":[45],"Second,":[46],"present":[48,135,156],"novel":[50],"likelihood":[51,97],"the":[54,68,83,99,111,121,130,138,142,148,162,166,169,172,188,194,204,238],"observations":[57,185],"dealing":[59],"overlapped":[61,229],"speech.":[62],"modification":[66],"Steered":[69],"Power":[70],"Response":[71],"Generalized":[72],"Cross":[73],"Correlation":[74],"Phase":[75],"Transform":[76],"(SPR-GCC-PHAT)":[77],"function":[78],"takes":[80],"into":[81],"account":[82],"possible":[84,152],"occlusions":[86],"use":[88],"its":[89],"local":[90,101],"maxima":[91,102],"as":[92,183,233,235],"observations.":[95],"The":[96,118],"extracted":[100],"given":[103],"positions":[104],"active":[106],"speakers":[107],"is":[108,126,141],"modeled":[109],"using":[110,193],"Joint":[112],"Probabilistic":[113],"Data":[114],"Association":[115],"(JPDA)":[116],"framework.":[117],"state":[119],"in":[120],"proposed":[122,205],"vector":[128],"activity":[132],"indicators":[133],"participants,":[136],"unknown":[139,170],"parameter":[140],"mapping":[143],"participants\u2019":[145,153],"locations":[146],"to":[147],"set":[149],"all":[151],"identities.":[154],"compare":[158],"ways":[160],"joint":[163],"estimation":[164],"states":[167],"parameter:":[171],"first,":[173],"forward":[175],"Bayesian":[176],"filter":[177],"sequential":[180],"estimate":[181],"updates":[182],"new":[184],"arrive":[186],"second,":[189],"batch":[191],"decoding":[192,202],"Viterbi":[195],"algorithm.":[196],"Results":[197],"show":[198],"that,":[199],"both":[201],"algorithms,":[203],"method":[206],"outperforms":[207],"standard":[208],"systems":[211],"based":[212],"on":[213,237],"(a)":[214],"(b)":[218],"processing,":[221],"dataset":[223],"significant":[225],"portion":[226],"(27.4%)":[227],"speech":[230],"scores":[232],"high":[234],"94.4%":[236],"F-measure":[239],"scale.":[240]},"counts_by_year":[{"year":2022,"cited_by_count":1},{"year":2019,"cited_by_count":2},{"year":2018,"cited_by_count":1},{"year":2014,"cited_by_count":1},{"year":2013,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
