{"id":"https://openalex.org/W7129584482","doi":"https://doi.org/10.48550/arxiv.2602.14172","title":"Investigation for Relative Voice Impression Estimation","display_name":"Investigation for Relative Voice Impression Estimation","publication_year":2026,"publication_date":"2026-02-15","ids":{"openalex":"https://openalex.org/W7129584482","doi":"https://doi.org/10.48550/arxiv.2602.14172"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.14172","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5055226151","display_name":"Kensuke Fujita","orcid":"https://orcid.org/0000-0001-9573-7539"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Fujita, Kenichi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5126206434","display_name":"Yusuke Ijima","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ijima, Yusuke","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5055226151"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.7324000000953674,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.7324000000953674,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.17260000109672546,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10403","display_name":"Phonetics and Phonology Research","score":0.01640000008046627,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/utterance","display_name":"Utterance","score":0.7968000173568726},{"id":"https://openalex.org/keywords/paralanguage","display_name":"Paralanguage","score":0.7354999780654907},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.5989999771118164},{"id":"https://openalex.org/keywords/pairwise-comparison","display_name":"Pairwise comparison","score":0.5236999988555908},{"id":"https://openalex.org/keywords/impression","display_name":"Impression","score":0.4966999888420105},{"id":"https://openalex.org/keywords/syllable","display_name":"Syllable","score":0.37929999828338623},{"id":"https://openalex.org/keywords/estimation","display_name":"Estimation","score":0.3693999946117401}],"concepts":[{"id":"https://openalex.org/C2775852435","wikidata":"https://www.wikidata.org/wiki/Q258403","display_name":"Utterance","level":2,"score":0.7968000173568726},{"id":"https://openalex.org/C133378560","wikidata":"https://www.wikidata.org/wiki/Q1753225","display_name":"Paralanguage","level":2,"score":0.7354999780654907},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6373000144958496},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6290000081062317},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.5989999771118164},{"id":"https://openalex.org/C184898388","wikidata":"https://www.wikidata.org/wiki/Q1435712","display_name":"Pairwise comparison","level":2,"score":0.5236999988555908},{"id":"https://openalex.org/C2776684213","wikidata":"https://www.wikidata.org/wiki/Q6007582","display_name":"Impression","level":2,"score":0.4966999888420105},{"id":"https://openalex.org/C109089402","wikidata":"https://www.wikidata.org/wiki/Q8188","display_name":"Syllable","level":2,"score":0.37929999828338623},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.37130001187324524},{"id":"https://openalex.org/C96250715","wikidata":"https://www.wikidata.org/wiki/Q965330","display_name":"Estimation","level":2,"score":0.3693999946117401},{"id":"https://openalex.org/C99209842","wikidata":"https://www.wikidata.org/wiki/Q643696","display_name":"Speech perception","level":3,"score":0.3472000062465668},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.34150001406669617},{"id":"https://openalex.org/C554936623","wikidata":"https://www.wikidata.org/wiki/Q199657","display_name":"Reading (process)","level":2,"score":0.30379998683929443},{"id":"https://openalex.org/C2779843651","wikidata":"https://www.wikidata.org/wiki/Q7390335","display_name":"SIGNAL (programming language)","level":2,"score":0.30239999294281006},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.29899999499320984},{"id":"https://openalex.org/C2778263558","wikidata":"https://www.wikidata.org/wiki/Q46384","display_name":"Microphone","level":3,"score":0.2980000078678131},{"id":"https://openalex.org/C43617652","wikidata":"https://www.wikidata.org/wiki/Q7575399","display_name":"Speech production","level":2,"score":0.29420000314712524},{"id":"https://openalex.org/C173988693","wikidata":"https://www.wikidata.org/wiki/Q678132","display_name":"Phonation","level":2,"score":0.2696000039577484}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.14172","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.14172","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.14172","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.14172","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"score":0.7347378730773926,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Paralinguistic":[0],"and":[1,72,105,129,158],"non-linguistic":[2],"aspects":[3],"of":[4,55,78,156,162],"speech":[5,99,103,164],"strongly":[6],"influence":[7],"listener":[8],"impressions.":[9],"While":[10],"most":[11],"research":[12],"focuses":[13],"on":[14],"absolute":[15],"impression":[16,23],"scoring,":[17],"this":[18,145],"study":[19,150],"investigates":[20],"relative":[21,59],"voice":[22],"estimation":[24,41],"(RIE),":[25],"a":[26,44,79,83],"framework":[27],"for":[28,98,144],"predicting":[29],"the":[30,37,52,56,61,152,160],"perceptual":[31,53,169],"difference":[32],"between":[33],"two":[34],"utterances":[35],"from":[36,48],"same":[38],"speaker.":[39],"The":[40],"target":[42],"is":[43],"low-dimensional":[45],"vector":[46],"derived":[47],"subjective":[49],"evaluations,":[50],"quantifying":[51],"shift":[54],"second":[57],"utterance":[58],"to":[60],"first":[62,153],"along":[63],"an":[64],"antonymic":[65],"axis":[66],"(e.g.,":[67,132],"``Dark--Bright'').":[68],"To":[69],"isolate":[70],"expressive":[71],"prosodic":[73],"variation,":[74],"we":[75],"used":[76,97],"recordings":[77],"professional":[80],"speaker":[81],"reading":[82],"text":[84],"in":[85,126,166],"various":[86],"styles.":[87],"We":[88],"compare":[89],"three":[90],"modeling":[91],"approaches:":[92],"classical":[93,122,135],"acoustic":[94,123],"features":[95,136],"commonly":[96],"emotion":[100],"recognition,":[101],"self-supervised":[102,117,163],"representations,":[104],"multimodal":[106],"large":[107],"language":[108],"models":[109,115,165],"(MLLMs).":[110],"Our":[111],"results":[112],"demonstrate":[113],"that":[114],"using":[116],"representations":[118],"outperform":[119],"methods":[120],"with":[121],"features,":[124],"particularly":[125],"capturing":[127,167],"complex":[128],"dynamic":[130],"impressions":[131],"``Cold--Warm'')":[133],"where":[134],"fail.":[137],"In":[138],"contrast,":[139],"current":[140],"MLLMs":[141],"prove":[142],"unreliable":[143],"fine-grained":[146],"pairwise":[147],"task.":[148],"This":[149],"provides":[151],"systematic":[154],"investigation":[155],"RIE":[157],"demonstrates":[159],"strength":[161],"subtle":[168],"variations.":[170]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-18T00:00:00"}
