{"id":"https://openalex.org/W2894809609","doi":"https://doi.org/10.1145/3209280.3229103","title":"Helmholtz Principle on word embeddings for automatic document segmentation","display_name":"Helmholtz Principle on word embeddings for automatic document segmentation","publication_year":2018,"publication_date":"2018-08-28","ids":{"openalex":"https://openalex.org/W2894809609","doi":"https://doi.org/10.1145/3209280.3229103","mag":"2894809609"},"language":"en","primary_location":{"id":"doi:10.1145/3209280.3229103","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3209280.3229103","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM Symposium on Document Engineering 2018","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5066939448","display_name":"Dominik Krzemi\u0144ski","orcid":"https://orcid.org/0000-0003-4568-0583"},"institutions":[{"id":"https://openalex.org/I79510175","display_name":"Cardiff University","ror":"https://ror.org/03kk7td41","country_code":"GB","type":"education","lineage":["https://openalex.org/I79510175"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Dominik Krzemi\u0144ski","raw_affiliation_strings":["CUBRIC, Cardiff University, United Kingdom"],"affiliations":[{"raw_affiliation_string":"CUBRIC, Cardiff University, United Kingdom","institution_ids":["https://openalex.org/I79510175"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080473462","display_name":"Helen Balinsky","orcid":null},"institutions":[{"id":"https://openalex.org/I4210156325","display_name":"Hewlett-Packard (United Kingdom)","ror":"https://ror.org/05g4mtv59","country_code":"GB","type":"company","lineage":["https://openalex.org/I1324840837","https://openalex.org/I4210156325"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Helen Balinsky","raw_affiliation_strings":["Hewlett-Packard Laboratories, United Kingdom"],"affiliations":[{"raw_affiliation_string":"Hewlett-Packard Laboratories, United Kingdom","institution_ids":["https://openalex.org/I4210156325"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5091025769","display_name":"Alexander Balinsky","orcid":"https://orcid.org/0000-0002-8151-4462"},"institutions":[{"id":"https://openalex.org/I79510175","display_name":"Cardiff University","ror":"https://ror.org/03kk7td41","country_code":"GB","type":"education","lineage":["https://openalex.org/I79510175"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Alexander Balinsky","raw_affiliation_strings":["Cardiff School of Mathematics, Cardiff University, United Kingdom"],"affiliations":[{"raw_affiliation_string":"Cardiff School of Mathematics, Cardiff University, United Kingdom","institution_ids":["https://openalex.org/I79510175"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5066939448"],"corresponding_institution_ids":["https://openalex.org/I79510175"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.10862845,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"4"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13083","display_name":"Advanced Text Analysis Techniques","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8155235052108765},{"id":"https://openalex.org/keywords/word2vec","display_name":"Word2vec","score":0.699979305267334},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.6601366400718689},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6495465040206909},{"id":"https://openalex.org/keywords/word-embedding","display_name":"Word embedding","score":0.6454728841781616},{"id":"https://openalex.org/keywords/automatic-summarization","display_name":"Automatic summarization","score":0.5580755472183228},{"id":"https://openalex.org/keywords/text-segmentation","display_name":"Text segmentation","score":0.552523672580719},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.5391481518745422},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5166647434234619},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.5086325407028198},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.504377007484436},{"id":"https://openalex.org/keywords/document-classification","display_name":"Document classification","score":0.4631921648979187},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.44408467411994934},{"id":"https://openalex.org/keywords/relation","display_name":"Relation (database)","score":0.4365535378456116},{"id":"https://openalex.org/keywords/categorization","display_name":"Categorization","score":0.4335525929927826},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.18125230073928833},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.08446916937828064}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8155235052108765},{"id":"https://openalex.org/C2776461190","wikidata":"https://www.wikidata.org/wiki/Q22673982","display_name":"Word2vec","level":3,"score":0.699979305267334},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.6601366400718689},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6495465040206909},{"id":"https://openalex.org/C2777462759","wikidata":"https://www.wikidata.org/wiki/Q18395344","display_name":"Word embedding","level":3,"score":0.6454728841781616},{"id":"https://openalex.org/C170858558","wikidata":"https://www.wikidata.org/wiki/Q1394144","display_name":"Automatic summarization","level":2,"score":0.5580755472183228},{"id":"https://openalex.org/C98501671","wikidata":"https://www.wikidata.org/wiki/Q1948408","display_name":"Text segmentation","level":3,"score":0.552523672580719},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.5391481518745422},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5166647434234619},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.5086325407028198},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.504377007484436},{"id":"https://openalex.org/C2780479914","wikidata":"https://www.wikidata.org/wiki/Q302088","display_name":"Document classification","level":2,"score":0.4631921648979187},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.44408467411994934},{"id":"https://openalex.org/C25343380","wikidata":"https://www.wikidata.org/wiki/Q277521","display_name":"Relation (database)","level":2,"score":0.4365535378456116},{"id":"https://openalex.org/C94124525","wikidata":"https://www.wikidata.org/wiki/Q912550","display_name":"Categorization","level":2,"score":0.4335525929927826},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.18125230073928833},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.08446916937828064},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0},{"id":"https://openalex.org/C202444582","wikidata":"https://www.wikidata.org/wiki/Q837863","display_name":"Pure mathematics","level":1,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3209280.3229103","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3209280.3229103","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM Symposium on Document Engineering 2018","raw_type":"proceedings-article"},{"id":"pmh:oai:https://orca.cardiff.ac.uk:112497","is_oa":false,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4306401195","display_name":"ORCA Online Research @Cardiff (Cardiff University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I79510175","host_organization_name":"Cardiff University","host_organization_lineage":["https://openalex.org/I79510175"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"acceptedVersion","is_accepted":true,"is_published":false,"raw_source_name":"","raw_type":"Conference or Workshop Item"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","score":0.7400000095367432,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":12,"referenced_works":["https://openalex.org/W1521626219","https://openalex.org/W1533452257","https://openalex.org/W1828401780","https://openalex.org/W2014378927","https://openalex.org/W2065303853","https://openalex.org/W2083821955","https://openalex.org/W2118229299","https://openalex.org/W2250539671","https://openalex.org/W2251803266","https://openalex.org/W2950577311","https://openalex.org/W3012483050","https://openalex.org/W4285719527"],"related_works":["https://openalex.org/W2946409105","https://openalex.org/W3152932816","https://openalex.org/W2985392712","https://openalex.org/W4388996947","https://openalex.org/W3133567596","https://openalex.org/W2798009317","https://openalex.org/W3203949288","https://openalex.org/W2580878117","https://openalex.org/W2997627311","https://openalex.org/W2950634454"],"abstract_inverted_index":{"Automatic":[0],"document":[1,49,52,101],"segmentation":[2,102],"gets":[3],"more":[4,6,90],"and":[5,87,130],"attention":[7],"in":[8,44,60],"the":[9,55,105,124],"natural":[10],"language":[11],"processing":[12],"field.":[13],"The":[14],"problem":[15],"is":[16],"defined":[17],"as":[18,73],"text":[19,68,118],"division":[20],"into":[21],"lexically":[22],"coherent":[23],"fragments.":[24],"In":[25],"fact,":[26],"most":[27,125],"of":[28,41,66,83,93,107,110,123,133],"realistic":[29],"documents":[30],"are":[31],"not":[32],"homogeneous,":[33],"so":[34],"extracting":[35],"underlying":[36],"structure":[37],"might":[38],"increase":[39],"performance":[40],"various":[42,67],"algorithms":[43],"problems":[45],"like":[46],"topic":[47],"recognition,":[48],"summarization,":[50],"or":[51,75],"categorization.":[53],"At":[54],"same":[56],"time":[57],"recent":[58],"advances":[59],"word":[61,94,127],"embedding":[62,128],"procedures":[63],"accelerated":[64],"development":[65],"mining":[69],"methods.":[70],"Models":[71],"such":[72],"word2vec,":[74],"GloVe":[76],"allow":[77],"for":[78,117],"efficient":[79],"learning":[80],"a":[81,99,137],"representation":[82],"large":[84],"textual":[85],"datasets":[86],"thus":[88],"introduce":[89],"robust":[91],"measures":[92],"similarities.":[95],"This":[96],"study":[97],"proposes":[98],"new":[100],"algorithm":[103],"combining":[104],"idea":[106],"embedding-based":[108],"measure":[109],"relation":[111],"between":[112],"words":[113],"with":[114],"Helmholtz":[115],"Principle":[116],"mining.":[119],"We":[120],"compare":[121],"two":[122],"common":[126],"models":[129],"show":[131],"improvement":[132],"our":[134],"approach":[135],"on":[136],"benchmark":[138],"dataset.":[139]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
