{"id":"https://openalex.org/W7154936604","doi":"https://doi.org/10.48550/arxiv.2604.15675","title":"C-Mining: Unsupervised Discovery of Seeds for Cultural Data Synthesis via Geometric Misalignment","display_name":"C-Mining: Unsupervised Discovery of Seeds for Cultural Data Synthesis via Geometric Misalignment","publication_year":2026,"publication_date":"2026-04-17","ids":{"openalex":"https://openalex.org/W7154936604","doi":"https://doi.org/10.48550/arxiv.2604.15675"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.15675","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.15675","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.15675","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5060490057","display_name":"Pufan Zeng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zeng, Pufan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134033341","display_name":"Yilun Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yilun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134029535","display_name":"Mingchen Dai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dai, Mingchen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134025294","display_name":"Mengyao Piao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Piao, Mengyao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134044289","display_name":"Chunguang Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Chunguang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134047673","display_name":"Lingqi Miao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Miao, Lingqi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134075278","display_name":"Shimin Tao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tao, Shimin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027448460","display_name":"Weibin Meng","orcid":"https://orcid.org/0000-0002-9384-9016"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Meng, Weibin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108150655","display_name":"M. He","orcid":"https://orcid.org/0009-0002-0589-4966"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Minggui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134049190","display_name":"Chenxin Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Chenxin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134046313","display_name":"Zhenzhen Qin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qin, Zhenzhen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134082885","display_name":"Li Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Li","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002727480","display_name":"Hongxia Ma","orcid":"https://orcid.org/0000-0002-2462-9693"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Hongxia","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134028603","display_name":"Boxing Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Boxing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5124626903","display_name":"Daimeng Wei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei, Daimeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12090","display_name":"Language and cultural evolution","score":0.296099990606308,"subfield":{"id":"https://openalex.org/subfields/3316","display_name":"Cultural Studies"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T12090","display_name":"Language and cultural evolution","score":0.296099990606308,"subfield":{"id":"https://openalex.org/subfields/3316","display_name":"Cultural Studies"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.09430000185966492,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13910","display_name":"Computational and Text Analysis Methods","score":0.055399999022483826,"subfield":{"id":"https://openalex.org/subfields/3300","display_name":"General Social Sciences"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.6779999732971191},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.6014999747276306},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.5848000049591064},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.5324000120162964},{"id":"https://openalex.org/keywords/point","display_name":"Point (geometry)","score":0.4672999978065491},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.44279998540878296}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.691100001335144},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.6779999732971191},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.6014999747276306},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.5848000049591064},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.5324000120162964},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5278000235557556},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.4672999978065491},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.44279998540878296},{"id":"https://openalex.org/C132964779","wikidata":"https://www.wikidata.org/wiki/Q2110223","display_name":"Raw data","level":2,"score":0.4424999952316284},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.387800008058548},{"id":"https://openalex.org/C120567893","wikidata":"https://www.wikidata.org/wiki/Q1582085","display_name":"Knowledge extraction","level":2,"score":0.3718999922275543},{"id":"https://openalex.org/C2778348673","wikidata":"https://www.wikidata.org/wiki/Q739302","display_name":"Production (economics)","level":2,"score":0.34290000796318054},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3395000100135803},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.32580000162124634},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3107999861240387},{"id":"https://openalex.org/C2994055011","wikidata":"https://www.wikidata.org/wiki/Q210272","display_name":"Cultural knowledge","level":2,"score":0.26759999990463257}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.15675","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.15675","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.15675","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.15675","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[{"score":0.7613325715065002,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Achieving":[0],"cultural":[1,47,77,102,179,202],"alignment":[2],"in":[3],"Large":[4],"Language":[5],"Models":[6],"(LLMs)":[7],"increasingly":[8],"depends":[9],"on":[10,38,144,189],"synthetic":[11],"data":[12,87,203],"generation.":[13],"For":[14],"such":[15],"synthesis,":[16],"the":[17,74,98,159,164],"most":[18],"vital":[19],"initial":[20],"step":[21],"is":[22],"seed":[23],"curation;":[24],"however,":[25],"current":[26],"methods":[27],"lack":[28],"quantifiable":[29,110,198],"standards":[30],"for":[31,200],"selecting":[32],"these":[33,116],"seeds.":[34],"Existing":[35],"approaches":[36],"rely":[37],"unscalable":[39],"manual":[40],"curation":[41],"or":[42,146],"bias-prone":[43],"LLM":[44,147],"extraction,":[45],"treating":[46],"specificity":[48],"as":[49,108],"an":[50,69],"abstract":[51],"concept":[52],"rather":[53],"than":[54,154],"a":[55,80,85,93,109,185,196],"measurable":[56],"signal.":[57,112],"In":[58],"this":[59,63,174],"paper,":[60],"we":[61],"address":[62],"\"quantification":[64],"gap\"":[65],"by":[66,119,152],"proposing":[67],"C-Mining,":[68],"unsupervised":[70],"framework":[71],"that":[72,173],"transforms":[73],"discovery":[75,111],"of":[76,101,166],"seeds":[78],"from":[79,138],"subjective":[81],"selection":[82],"process":[83],"into":[84],"computable":[86],"mining":[88],"formulation.":[89],"Our":[90],"approach":[91,176],"exploits":[92],"novel":[94],"geometric":[95,124],"insight,":[96],"leveraging":[97],"cross-lingual":[99],"misalignment":[100],"concepts":[103],"within":[104],"pre-trained":[105],"embedding":[106],"spaces":[107],"By":[113],"systematically":[114],"identifying":[115],"regions":[117],"characterized":[118],"pronounced":[120],"linguistic":[121],"exclusivity":[122],"and":[123,181,191],"isolation,":[125],"while":[126],"actively":[127],"filtering":[128],"out":[129],"noise,":[130],"C-Mining":[131],"automatically":[132],"extracts":[133],"high-fidelity":[134],"Culture":[135],"Points":[136],"(CPs)":[137],"raw":[139],"multilingual":[140],"corpora":[141],"without":[142],"reliance":[143],"human":[145],"supervision,":[148],"reducing":[149],"preparation":[150],"costs":[151],"more":[153],"150-fold.":[155],"We":[156],"further":[157],"leverage":[158],"mined":[160],"knowledge":[161],"to":[162],"steer":[163],"synthesis":[165],"diverse":[167],"instruction-tuning":[168],"datasets.":[169],"Extensive":[170],"experiments":[171],"demonstrate":[172],"seed-centric":[175],"significantly":[177],"enhances":[178],"understanding":[180],"reasoning":[182],"capabilities,":[183],"achieving":[184],"+6.03":[186],"point":[187],"improvement":[188],"CulturalBench-Hard":[190],"surpassing":[192],"state-of-the-art":[193],"baselines,":[194],"providing":[195],"scalable,":[197],"solution":[199],"high-quality":[201],"synthesis.":[204]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-04-21T00:00:00"}
