{"id":"https://openalex.org/W7135009930","doi":"https://doi.org/10.48550/arxiv.2603.10468","title":"G-STAR: End-to-End Global Speaker-Tracking Attributed Recognition","display_name":"G-STAR: End-to-End Global Speaker-Tracking Attributed Recognition","publication_year":2026,"publication_date":"2026-03-11","ids":{"openalex":"https://openalex.org/W7135009930","doi":"https://doi.org/10.48550/arxiv.2603.10468"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.10468","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.10468","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.10468","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128870886","display_name":"Jing Peng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peng, Jing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128816548","display_name":"Ziyi Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Ziyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128826388","display_name":"Haoyu Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Haoyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128805654","display_name":"Yucheng Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yucheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128851970","display_name":"Duo Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Duo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128827532","display_name":"Mengtian Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Mengtian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125700088","display_name":"Yunfan Du","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Du, Yunfan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128879113","display_name":"Dezhu Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Dezhu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128850056","display_name":"Kai Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Kai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128880059","display_name":"Shuai Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Shuai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9733999967575073,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9733999967575073,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.00930000003427267,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.004399999976158142,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.5605999827384949},{"id":"https://openalex.org/keywords/identity","display_name":"Identity (music)","score":0.5483999848365784},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.529699981212616},{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.5192000269889832},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.48350000381469727},{"id":"https://openalex.org/keywords/transcription","display_name":"Transcription (linguistics)","score":0.4287000000476837},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.397599995136261}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6342999935150146},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.5605999827384949},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.5483999848365784},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.529699981212616},{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.5192000269889832},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.48350000381469727},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.45399999618530273},{"id":"https://openalex.org/C179926584","wikidata":"https://www.wikidata.org/wiki/Q207714","display_name":"Transcription (linguistics)","level":2,"score":0.4287000000476837},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4169999957084656},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.397599995136261},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.39579999446868896},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.31049999594688416},{"id":"https://openalex.org/C59656382","wikidata":"https://www.wikidata.org/wiki/Q191536","display_name":"Conjunction (astronomy)","level":2,"score":0.295199990272522},{"id":"https://openalex.org/C46312422","wikidata":"https://www.wikidata.org/wiki/Q11024","display_name":"Communication","level":1,"score":0.2818000018596649},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.27799999713897705},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.2775000035762787},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.2773999869823456},{"id":"https://openalex.org/C145912823","wikidata":"https://www.wikidata.org/wiki/Q113558","display_name":"Dynamics (music)","level":2,"score":0.271699994802475},{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.26179999113082886}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.10468","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.10468","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.10468","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.10468","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0,56],"study":[1],"timestamped":[2],"speaker-attributed":[3,125],"automatic":[4],"speech":[5,11],"recognition":[6],"(SA-ASR)":[7],"for":[8],"long-form,":[9],"multi-party":[10],"with":[12,68,79],"overlap.":[13],"In":[14],"this":[15],"setting,":[16],"chunk-wise":[17,110],"inference":[18],"must":[19],"preserve":[20],"meeting-level":[21],"speaker":[22,77],"identity":[23,54],"consistency":[24],"while":[25],"producing":[26],"time-stamped,":[27],"speaker-labeled":[28],"transcripts.":[29],"Prior":[30],"Speech-LLM":[31,70],"systems":[32],"tend":[33],"to":[34,45],"prioritize":[35],"either":[36],"local":[37,117],"diarization":[38],"or":[39],"global":[40,121],"labeling,":[41],"lacking":[42],"the":[43,83],"ability":[44],"jointly":[46],"model":[47],"fine-grained":[48],"temporal":[49,80],"boundaries":[50],"and":[51,82,96,106,119],"robust":[52],"cross-chunk":[53],"linking.":[55],"propose":[57],"G-STAR,":[58],"an":[59],"end-to-end":[60,98],"framework":[61],"that":[62],"couples":[63],"a":[64,69],"cache-conditioned":[65],"speaker-tracking":[66],"module":[67],"transcription":[71,126],"backbone.":[72],"The":[73],"tracker":[74],"provides":[75],"structured":[76],"cues":[78],"grounding,":[81],"LLM":[84],"generates":[85],"attributed":[86],"text":[87],"conditioned":[88],"on":[89,114],"these":[90],"cues.":[91],"G-STAR":[92],"supports":[93],"component-wise":[94],"optimization":[95],"joint":[97],"training,":[99],"enabling":[100],"flexible":[101],"learning":[102],"under":[103],"heterogeneous":[104],"supervision":[105],"domain":[107],"shift.":[108],"Under":[109],"decoding":[111],"protocols,":[112],"experiments":[113],"both":[115],"oracle-segmented":[116],"evaluation":[118,122],"full-meeting":[120],"show":[123],"strong":[124],"performance.":[127]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-03-13T00:00:00"}
