{"id":"https://openalex.org/W19798656","doi":"https://doi.org/10.21437/eurospeech.1999-555","title":"A hierarchical approach to large-scale speaker recognition","display_name":"A hierarchical approach to large-scale speaker recognition","publication_year":1999,"publication_date":"1999-09-05","ids":{"openalex":"https://openalex.org/W19798656","doi":"https://doi.org/10.21437/eurospeech.1999-555","mag":"19798656"},"language":"en","primary_location":{"id":"doi:10.21437/eurospeech.1999-555","is_oa":false,"landing_page_url":"https://doi.org/10.21437/eurospeech.1999-555","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"6th European Conference on Speech Communication and Technology","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5030917082","display_name":"Homayoon Beigi","orcid":"https://orcid.org/0000-0003-0127-2385"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Homayoon S. M. Beigi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090405149","display_name":"St\u00e9phane Maes","orcid":"https://orcid.org/0000-0003-0195-2313"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"St\u00e9phane H. Maes","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110218731","display_name":"Upendra V. Chaudhari","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Upendra V. Chaudhari","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5110191415","display_name":"Jeffrey Sorensen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jeffrey S. Sorensen","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5030917082"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.00301961,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":94},"biblio":{"volume":null,"issue":null,"first_page":"2203","last_page":"2206"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9944999814033508,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9943000078201294,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8268629312515259},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.76880943775177},{"id":"https://openalex.org/keywords/hidden-markov-model","display_name":"Hidden Markov model","score":0.7591915726661682},{"id":"https://openalex.org/keywords/adaptation","display_name":"Adaptation (eye)","score":0.5586606860160828},{"id":"https://openalex.org/keywords/utterance","display_name":"Utterance","score":0.5555993318557739},{"id":"https://openalex.org/keywords/usable","display_name":"USable","score":0.5526590347290039},{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.5080900192260742},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.4828958511352539},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4540993273258209},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.4214962422847748},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.41539692878723145},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3670019507408142}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8268629312515259},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.76880943775177},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.7591915726661682},{"id":"https://openalex.org/C139807058","wikidata":"https://www.wikidata.org/wiki/Q352374","display_name":"Adaptation (eye)","level":2,"score":0.5586606860160828},{"id":"https://openalex.org/C2775852435","wikidata":"https://www.wikidata.org/wiki/Q258403","display_name":"Utterance","level":2,"score":0.5555993318557739},{"id":"https://openalex.org/C2780615836","wikidata":"https://www.wikidata.org/wiki/Q2471869","display_name":"USable","level":2,"score":0.5526590347290039},{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.5080900192260742},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.4828958511352539},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4540993273258209},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.4214962422847748},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.41539692878723145},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3670019507408142},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.21437/eurospeech.1999-555","is_oa":false,"landing_page_url":"https://doi.org/10.21437/eurospeech.1999-555","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"6th European Conference on Speech Communication and Technology","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.6499999761581421,"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":5,"referenced_works":["https://openalex.org/W1501187080","https://openalex.org/W2025037086","https://openalex.org/W2109299006","https://openalex.org/W2147794814","https://openalex.org/W2282029110"],"related_works":["https://openalex.org/W2206035908","https://openalex.org/W4247736853","https://openalex.org/W2162158162","https://openalex.org/W1493012537","https://openalex.org/W1999004162","https://openalex.org/W2175373321","https://openalex.org/W2125642021","https://openalex.org/W1521049138","https://openalex.org/W2938358845","https://openalex.org/W2997340161"],"abstract_inverted_index":{"CORRECTIVE":[0],"TRAINING":[1],"FOR":[2],"SPEAKER":[3],"ADAPTATIONXiuyang":[4],"Yu":[5],"and":[6,50,94,107,279,359,421,453,471],"Wayne":[7],"WardCenter":[8],"for":[9,27,123,133,181,225,246,273,408,434,506,548],"Spoken":[10],"Language":[11],"UnderstandingUniversity":[12],"of":[13,190,304,333,347,368,446,528],"Colorado,":[14],"Boulder,":[15],"ColoradoABSTRACTThis":[16],"paper":[17],"reports":[18],"results":[19,251],"on":[20,64,150,252,530],"an":[21,57,253,431,435],"experiment":[22,254],"to":[23,40,45,67,76,84,119,196,202,215,218,228,233,242,255,259,265,281,294,321,326,337,342,354,415,429,464,479,486,493,515,534],"use":[24,187,256,525],"correctivetraining":[25],"techniques":[26,174,186,206],"rapid":[28],"acoustic":[29,43,200,263],"speaker":[30,159],"adaptation":[31,58,214],"in":[32,88,102,109,164,179,210,552],"asemi-continuous":[33],"speech":[34,92,284,507],"recognition":[35,147,276,285],"system.":[36],"Decoder":[37],"outputis":[38],"used":[39,55,83,414,428,485],"adjust":[41],"HMM":[42,199,262,521,550],"models":[44,104,111,201,237,264],"improvediscrimination":[46],"between":[47],"correct":[48,78,103,234,357,376,380,396,451,467],"words":[49,358,371,381,397,468],"near":[51,338,360,369,387,410,418,447],"misses.Twenty":[52],"sentences":[53,191],"are":[54,112,117,129,207,290,299,398,413,455],"as":[56,353],"set.":[59,307],"A":[60],"speechrecognizer":[61],"is":[62,73,82,160,335,484,490,539],"run":[63],"each":[65,86,98,549],"utterance":[66,452],"generate":[68,416,430],"a":[69,141,158,182,203,243,323,366,386,423,545],"wordlattice.":[70],"The":[71,114,136,362],"lattice":[72],"pruned":[74],"relative":[75],"the":[77,91,124,130,167,220,226,240,247,302,305,313,345,375,379,383,390,417,444,450,458,466,495,501,553],"path.":[79],"Theforward-backward":[80],"algorithm":[81],"align":[85],"path":[87],"thelattice":[89],"against":[90,382,389,457],"input":[93,99],"compute":[95],"observationcounts.":[96],"For":[97],"frame,":[100],"counts":[101,108,116],"areadjusted":[105],"upward,":[106],"incorrect":[110,401],"adjusteddownward.":[113],"adjusted":[115,128,300,463],"normalized":[118],"generatenew":[120],"observation":[121,536],"probabilities":[122],"models.":[125],"Theparameters":[126],"being":[127],"mixture":[131],"weights":[132],"thesemi-continuous":[134],"HMMs.":[135],"technique":[137],"reduced":[138],"word":[139,275,314,348,437],"errorfor":[140],"test":[142],"subject":[143],"by":[144,277,286,319],"37%":[145],"relative.INTRODUCTIONSpeech":[146],"systems":[148,524,543],"based":[149],"Hidden":[151,512],"MarkovModels":[152,513],"typically":[153],"experience":[154],"significant":[155],"performancedegradation":[156],"when":[157],"not":[161,310],"represented":[162],"well":[163,223],"thedata":[165],"that":[166,395],"system":[168,221,241,499,505],"was":[169,271,427,442],"trained":[170,292],"on.":[171],"Rapid":[172],"speakeradaptation":[173],"can":[175],"be":[176,216,229],"very":[177,235],"effective":[178],"improvingperformance":[180],"novel":[183],"speaker.":[184],"These":[185],"asmall":[188],"number":[189,346],"(20-40),":[192],"whose":[193],"transcript":[194],"isknown,":[195],"quickly":[197,238],"adapt":[198,261],"newspeaker.":[204],"Such":[205],"important,":[208],"since":[209],"order":[211,478],"forlonger":[212],"term":[213],"able":[217],"work,":[219],"mustfunction":[222],"enough":[224],"user":[227],"productive.":[230],"Thesetechniques":[231],"attempt":[232],"poor":[236],"toget":[239],"usable":[244],"point":[245],"user.":[248],"This":[249,308,440],"paperreports":[250],"corrective":[257,482],"trainingtechniques":[258],"rapidly":[260],"apoorly":[266],"recognized":[267],"speaker.CORRECTIVE":[268],"TRAININGCorrective":[269],"training":[270,306,483],"introduced":[272],"speaker-dependentisolated":[274],"[1]":[278],"extended":[280],"speaker-independent":[282],"continuous":[283,542],"[2].":[287],"HiddenMarkov":[288],"Models":[289],"normally":[291],"according":[293],"amaximum":[295],"likelihood":[296],"criterion;":[297],"parameters":[298,351,461],"tomaximize":[301],"probability":[303,325],"processdoes":[309],"directly":[311,343],"minimize":[312,344],"errors,":[315],"it":[316,489],"does":[317],"thisindirectly":[318],"attempting":[320],"assign":[322],"high":[324],"thecorrect":[327],"utterance.":[328],"It":[329],"makes":[330],"no":[331],"representation":[332],"whatprobability":[334],"assigned":[336],"misses.":[339,361,419,448],"Corrective":[340],"trainingseeks":[341],"errors":[349],"byadjusting":[350],"so":[352],"improve":[355],"discriminationbetween":[356],"general":[363],"processis:1.":[364],"Generate":[365],"set":[367,445,546],"misses,":[370],"which":[372],"areconfusable":[373],"with":[374],"words.2.":[377],"Align":[378,385],"input.3.":[384],"miss":[388],"input.4.":[391],"Modify":[392],"model":[393,516],"such":[394],"more":[399,469],"likelyand":[400],"ones":[402,473],"less":[403,474],"likely.5.":[404],"Repeat":[405],"steps":[406],"2-3":[407],"other":[409],"misses.Speech":[411],"recognizers":[412],"In[1]":[420],"[2],":[422],"sp":[424],"eech":[425],"recognizer":[426],"n-best":[432],"list":[433,441],"isolated":[436],"or":[438],"sentence.":[439],"usedas":[443],"Both":[449],"nearmiss":[454],"aligned":[456],"input.":[459],"Model":[460],"arethen":[462],"make":[465],"likely":[470],"theincorrect":[472],"likely.SPHINX-II":[475],"OBSERVATION":[476],"ESTIMATESIn":[477],"describe":[480,494],"how":[481],"adaptour":[487],"model,":[488],"first":[491],"necessary":[492],"basic":[496],"model.Our":[497],"experimental":[498],"uses":[500,510],"Carnegie":[502],"MellonUniversity":[503],"Sphinx-II":[504,509],"recognition[3],[4],[5].":[508],"semi-continuous":[511,523],"[3]":[514],"context":[517],"dependent":[518],"phones.":[519],"Likecontinuous":[520],"systems,":[522],"aweighted":[526],"sum":[527],"points":[529],"Gaussian":[531],"Probability":[532],"DensityFunctions":[533],"estimate":[535,544],"probabilities.":[537],"Thedifference":[538],"that,":[540],"while":[541],"ofdistributions":[547],"state":[551],"system,":[554],"semi-":[555]},"counts_by_year":[{"year":2013,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2016-06-24T00:00:00"}
