{"id":"https://openalex.org/W7148371136","doi":"https://doi.org/10.48550/arxiv.2604.00023","title":"Phonological Fossils: Machine Learning Detection of Non-Mainstream Vocabulary in Sulawesi Basic Lexicon","display_name":"Phonological Fossils: Machine Learning Detection of Non-Mainstream Vocabulary in Sulawesi Basic Lexicon","publication_year":2026,"publication_date":"2026-03-11","ids":{"openalex":"https://openalex.org/W7148371136","doi":"https://doi.org/10.48550/arxiv.2604.00023"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.00023","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00023","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.00023","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132764591","display_name":"Mukhlis Amien","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Amien, Mukhlis","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5005886095","display_name":"Go Frendi Gunawan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gunawan, Go Frendi","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5132764591"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12090","display_name":"Language and cultural evolution","score":0.3025999963283539,"subfield":{"id":"https://openalex.org/subfields/3316","display_name":"Cultural Studies"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T12090","display_name":"Language and cultural evolution","score":0.3025999963283539,"subfield":{"id":"https://openalex.org/subfields/3316","display_name":"Cultural Studies"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10034","display_name":"Syntax, Semantics, Linguistic Variation","score":0.10419999808073044,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10403","display_name":"Phonetics and Phonology Research","score":0.06109999865293503,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cognate","display_name":"Cognate","score":0.8180000185966492},{"id":"https://openalex.org/keywords/vocabulary","display_name":"Vocabulary","score":0.5979999899864197},{"id":"https://openalex.org/keywords/lexicon","display_name":"Lexicon","score":0.5934000015258789},{"id":"https://openalex.org/keywords/classifier","display_name":"Classifier (UML)","score":0.5270000100135803},{"id":"https://openalex.org/keywords/phonology","display_name":"Phonology","score":0.46239998936653137},{"id":"https://openalex.org/keywords/language-family","display_name":"Language family","score":0.4401000142097473},{"id":"https://openalex.org/keywords/complement","display_name":"Complement (music)","score":0.4129999876022339}],"concepts":[{"id":"https://openalex.org/C2777392089","wikidata":"https://www.wikidata.org/wiki/Q690548","display_name":"Cognate","level":2,"score":0.8180000185966492},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6154999732971191},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.61080002784729},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.5979999899864197},{"id":"https://openalex.org/C2778121359","wikidata":"https://www.wikidata.org/wiki/Q8096","display_name":"Lexicon","level":2,"score":0.5934000015258789},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5292999744415283},{"id":"https://openalex.org/C95623464","wikidata":"https://www.wikidata.org/wiki/Q1096149","display_name":"Classifier (UML)","level":2,"score":0.5270000100135803},{"id":"https://openalex.org/C148934300","wikidata":"https://www.wikidata.org/wiki/Q40998","display_name":"Phonology","level":2,"score":0.46239998936653137},{"id":"https://openalex.org/C2780566098","wikidata":"https://www.wikidata.org/wiki/Q25295","display_name":"Language family","level":2,"score":0.4401000142097473},{"id":"https://openalex.org/C112313634","wikidata":"https://www.wikidata.org/wiki/Q7886648","display_name":"Complement (music)","level":5,"score":0.4129999876022339},{"id":"https://openalex.org/C2779207338","wikidata":"https://www.wikidata.org/wiki/Q9240","display_name":"Indonesian","level":2,"score":0.3691999912261963},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.34459999203681946},{"id":"https://openalex.org/C2779065236","wikidata":"https://www.wikidata.org/wiki/Q49228","display_name":"Austronesian languages","level":2,"score":0.3433000147342682},{"id":"https://openalex.org/C171041071","wikidata":"https://www.wikidata.org/wiki/Q36870","display_name":"First language","level":2,"score":0.31869998574256897},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3037000000476837},{"id":"https://openalex.org/C2987567764","wikidata":"https://www.wikidata.org/wiki/Q125421","display_name":"Second language","level":2,"score":0.2994999885559082},{"id":"https://openalex.org/C2778203577","wikidata":"https://www.wikidata.org/wiki/Q38035","display_name":"Consonant","level":3,"score":0.2879999876022339},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.2648000121116638}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.00023","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00023","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.00023","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00023","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.6534759402275085,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Basic":[0,61],"vocabulary":[1,25],"in":[2,58,173],"many":[3],"Sulawesi":[4,56,147],"Austronesian":[5,60,107],"languages":[6,57,143,148,159],"includes":[7],"forms":[8,53,69,89],"resisting":[9],"reconstruction":[10],"to":[11,140],"any":[12],"proto-form":[13],"with":[14,18,42,90],"phonological":[15,49,83,94,165,182],"patterns":[16],"inconsistent":[17],"inherited":[19,86],"roots,":[20],"but":[21],"whether":[22],"this":[23],"non-conforming":[24],"represents":[26],"pre-Austronesian":[27,136],"substrate":[28,68,189],"or":[29],"independent":[30],"innovation":[31],"has":[32],"not":[33],"been":[34],"tested":[35],"computationally.":[36],"We":[37],"combine":[38],"rule-based":[39],"cognate":[40,72,127],"subtraction":[41,73],"a":[43,93,134,187],"machine":[44,166],"learning":[45,167],"classifier":[46,79],"trained":[47,80],"on":[48,81],"features.":[50],"Using":[51],"1,357":[52],"from":[54,87],"six":[55],"the":[59],"Vocabulary":[62],"Database,":[63],"we":[64],"identify":[65],"438":[66],"candidate":[67],"(26.5%)":[70],"through":[71],"and":[74,105],"Proto-Austronesian":[75],"cross-checking.":[76],"An":[77],"XGBoost":[78],"26":[82],"features":[84],"distinguishes":[85],"non-mainstream":[88,116,152,175],"AUC=0.763,":[91],"revealing":[92],"fingerprint:":[95],"longer":[96],"forms,":[97],"more":[98],"consonant":[99],"clusters,":[100],"higher":[101,150],"glottal":[102],"stop":[103],"rates,":[104],"fewer":[106],"prefixes.":[108],"Cross-method":[109],"consensus":[110],"(Cohen's":[111],"kappa=0.61)":[112],"identifies":[113],"266":[114],"high-confidence":[115],"candidates.":[117],"However,":[118],"clustering":[119],"yields":[120],"no":[121,131],"coherent":[122],"word":[123],"families":[124],"(silhouette=0.114;":[125],"cross-linguistic":[126],"test":[128],"p=0.569),":[129],"providing":[130],"evidence":[132,185],"for":[133,186],"single":[135],"language":[137],"layer.":[138],"Application":[139],"16":[141],"additional":[142],"confirms":[144],"geographic":[145],"patterning:":[146],"show":[149],"predicted":[151],"rates":[153],"(mean":[154],"P_sub=0.606)":[155],"than":[156],"Western":[157],"Indonesian":[158],"(0.393).":[160],"This":[161],"study":[162],"demonstrates":[163],"that":[164],"can":[168],"complement":[169],"traditional":[170],"comparative":[171],"methods":[172],"detecting":[174],"lexical":[176],"layers,":[177],"while":[178],"cautioning":[179],"against":[180],"interpreting":[181],"non-conformity":[183],"as":[184],"shared":[188],"language.":[190]},"counts_by_year":[],"updated_date":"2026-04-03T16:44:17.987007","created_date":"2026-04-03T00:00:00"}
