{"id":"https://openalex.org/W7079645476","doi":"https://doi.org/10.48550/arxiv.2509.00990","title":"Hybrid Topic-Semantic Labeling and Graph Embeddings for Unsupervised Legal Document Clustering","display_name":"Hybrid Topic-Semantic Labeling and Graph Embeddings for Unsupervised Legal Document Clustering","publication_year":2025,"publication_date":"2025-08-31","ids":{"openalex":"https://openalex.org/W7079645476","doi":"https://doi.org/10.48550/arxiv.2509.00990"},"language":"en","primary_location":{"id":"doi:10.48550/arxiv.2509.00990","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.00990","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2509.00990","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Bastola, Deepak","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Bastola, Deepak","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Choi, Woohyeok","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Choi, Woohyeok","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T12157","display_name":"Geochemistry and Geologic Mapping","score":0.4422999918460846,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12157","display_name":"Geochemistry and Geologic Mapping","score":0.4422999918460846,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13177","display_name":"Geological and Geophysical Studies","score":0.03779999911785126,"subfield":{"id":"https://openalex.org/subfields/1907","display_name":"Geology"},"field":{"id":"https://openalex.org/fields/19","display_name":"Earth and Planetary Sciences"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13067","display_name":"Geological Modeling and Analysis","score":0.030799999833106995,"subfield":{"id":"https://openalex.org/subfields/1906","display_name":"Geochemistry and Petrology"},"field":{"id":"https://openalex.org/fields/19","display_name":"Earth and Planetary Sciences"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.5394999980926514},{"id":"https://openalex.org/keywords/matrix-decomposition","display_name":"Matrix decomposition","score":0.48579999804496765},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.4528999924659729},{"id":"https://openalex.org/keywords/latent-dirichlet-allocation","display_name":"Latent Dirichlet allocation","score":0.44429999589920044},{"id":"https://openalex.org/keywords/unsupervised-learning","display_name":"Unsupervised learning","score":0.42890000343322754},{"id":"https://openalex.org/keywords/document-clustering","display_name":"Document clustering","score":0.4146000146865845},{"id":"https://openalex.org/keywords/word-embedding","display_name":"Word embedding","score":0.4140999913215637},{"id":"https://openalex.org/keywords/dimensionality-reduction","display_name":"Dimensionality reduction","score":0.3977999985218048},{"id":"https://openalex.org/keywords/relevance","display_name":"Relevance (law)","score":0.3815000057220459}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.746399998664856},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5692999958992004},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.5394999980926514},{"id":"https://openalex.org/C42355184","wikidata":"https://www.wikidata.org/wiki/Q1361088","display_name":"Matrix decomposition","level":3,"score":0.48579999804496765},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.48010000586509705},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.4528999924659729},{"id":"https://openalex.org/C500882744","wikidata":"https://www.wikidata.org/wiki/Q269236","display_name":"Latent Dirichlet allocation","level":3,"score":0.44429999589920044},{"id":"https://openalex.org/C8038995","wikidata":"https://www.wikidata.org/wiki/Q1152135","display_name":"Unsupervised learning","level":2,"score":0.42890000343322754},{"id":"https://openalex.org/C177937566","wikidata":"https://www.wikidata.org/wiki/Q4223102","display_name":"Document clustering","level":3,"score":0.4146000146865845},{"id":"https://openalex.org/C2777462759","wikidata":"https://www.wikidata.org/wiki/Q18395344","display_name":"Word embedding","level":3,"score":0.4140999913215637},{"id":"https://openalex.org/C70518039","wikidata":"https://www.wikidata.org/wiki/Q16000077","display_name":"Dimensionality reduction","level":2,"score":0.3977999985218048},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.38280001282691956},{"id":"https://openalex.org/C158154518","wikidata":"https://www.wikidata.org/wiki/Q7310970","display_name":"Relevance (law)","level":2,"score":0.3815000057220459},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.3573000133037567},{"id":"https://openalex.org/C2780762811","wikidata":"https://www.wikidata.org/wiki/Q1784941","display_name":"Cosine similarity","level":3,"score":0.35370001196861267},{"id":"https://openalex.org/C105611402","wikidata":"https://www.wikidata.org/wiki/Q2976589","display_name":"Spectral clustering","level":3,"score":0.35190001130104065},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.33820000290870667},{"id":"https://openalex.org/C75564084","wikidata":"https://www.wikidata.org/wiki/Q5597085","display_name":"Graph embedding","level":3,"score":0.32010000944137573},{"id":"https://openalex.org/C115178988","wikidata":"https://www.wikidata.org/wiki/Q772067","display_name":"Laplacian matrix","level":3,"score":0.2969000041484833},{"id":"https://openalex.org/C197657726","wikidata":"https://www.wikidata.org/wiki/Q174733","display_name":"Bipartite graph","level":3,"score":0.2939999997615814},{"id":"https://openalex.org/C186625053","wikidata":"https://www.wikidata.org/wiki/Q1130191","display_name":"Information overload","level":2,"score":0.28859999775886536},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.2815000116825104},{"id":"https://openalex.org/C171686336","wikidata":"https://www.wikidata.org/wiki/Q3532085","display_name":"Topic model","level":2,"score":0.2809999883174896},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2782000005245209},{"id":"https://openalex.org/C81669768","wikidata":"https://www.wikidata.org/wiki/Q2359161","display_name":"Precision and recall","level":2,"score":0.26420000195503235},{"id":"https://openalex.org/C111168008","wikidata":"https://www.wikidata.org/wiki/Q1136838","display_name":"Self-organizing map","level":3,"score":0.25200000405311584},{"id":"https://openalex.org/C75553542","wikidata":"https://www.wikidata.org/wiki/Q178161","display_name":"A priori and a posteriori","level":2,"score":0.25110000371932983},{"id":"https://openalex.org/C2780479914","wikidata":"https://www.wikidata.org/wiki/Q302088","display_name":"Document classification","level":2,"score":0.2506999969482422},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.25040000677108765}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2509.00990","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.00990","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2509.00990","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.00990","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions","score":0.7047046422958374}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Legal":[0],"documents":[1],"pose":[2],"unique":[3],"challenges":[4],"for":[5,24,183,199,221,241],"text":[6],"classification":[7],"due":[8],"to":[9,42,54,149,211,230],"their":[10],"domain-specific":[11,193,239],"language":[12],"and":[13,32,47,52,69,112,118,132,174,206,215,226,238],"often":[14],"limited":[15],"labeled":[16],"data.":[17],"This":[18],"paper":[19],"proposes":[20],"a":[21,36,59,81,101,228],"hybrid":[22],"approach":[23,90,148],"classifying":[25],"legal":[26,63,82,151,185,213,223,243],"texts":[27],"by":[28,154],"combining":[29,155],"unsupervised":[30,150],"topic":[31,157,172],"graph":[33,61,160],"embeddings":[34,46,66],"with":[35,159],"supervised":[37,231],"model.":[38],"We":[39,99],"employ":[40],"Top2Vec":[41],"learn":[43],"semantic":[44,156],"document":[45,83,152],"automatically":[48],"discover":[49],"latent":[50],"topics,":[51],"Node2Vec":[53],"capture":[55],"structural":[56],"relationships":[57],"via":[58],"bipartite":[60],"of":[62,76,104,110,115,170,178,192,203],"documents.":[64,77],"The":[65,217],"are":[67],"combined":[68,88],"clustered":[70],"using":[71],"KMeans,":[72],"yielding":[73],"coherent":[74],"groupings":[75],"Our":[78],"computations":[79],"on":[80],"dataset":[84],"demonstrate":[85,119],"that":[86,120,141],"the":[87,108,113,116,143,168,175,179,190],"Top2Vec+Node2Vec":[89],"improves":[91],"clustering":[92],"quality":[93,169],"over":[94],"text-only":[95],"or":[96],"graph-only":[97],"embeddings.":[98],"conduct":[100],"sensitivity":[102],"analysis":[103,153,225],"hyperparameters,":[105],"such":[106],"as":[107,227],"number":[109],"clusters":[111],"dimensionality":[114],"embeddings,":[117,194],"our":[121],"method":[122],"achieves":[123],"competitive":[124],"performance":[125],"against":[126],"baseline":[127],"Latent":[128],"Dirichlet":[129],"Allocation":[130],"(LDA)":[131],"Non-Negative":[133],"Matrix":[134],"Factorization":[135],"(NMF)":[136],"models.":[137],"Key":[138],"findings":[139],"indicate":[140],"while":[142],"pipeline":[144,218],"presents":[145],"an":[146],"innovative":[147],"modeling":[158],"embedding":[161,181],"techniques,":[162],"its":[163],"efficacy":[164],"is":[165],"contingent":[166],"upon":[167],"initial":[171],"generation":[173],"representational":[176],"power":[177],"chosen":[180],"models":[182],"specialized":[184],"language.":[186],"Strategic":[187],"recommendations":[188],"include":[189],"exploration":[191],"more":[195],"comprehensive":[196],"hyperparameter":[197],"tuning":[198],"Node2Vec,":[200],"dynamic":[201],"determination":[202],"cluster":[204],"numbers,":[205],"robust":[207],"human-in-the-loop":[208],"validation":[209],"processes":[210],"enhance":[212],"relevance":[214],"trustworthiness.":[216],"demonstrates":[219],"potential":[220],"exploratory":[222],"data":[224],"precursor":[229],"learning":[232],"tasks":[233],"but":[234],"requires":[235],"further":[236],"refinement":[237],"adaptation":[240],"practical":[242],"applications.":[244]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
