{"id":"https://openalex.org/W7108715310","doi":"https://doi.org/10.1051/ita/2025020","title":"Efficient <i>k</i> -mer dataset compression using Eulerian covers of de Bruijn graphs and BWT","display_name":"Efficient <i>k</i> -mer dataset compression using Eulerian covers of de Bruijn graphs and BWT","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W7108715310","doi":"https://doi.org/10.1051/ita/2025020"},"language":null,"primary_location":{"id":"doi:10.1051/ita/2025020","is_oa":true,"landing_page_url":"https://doi.org/10.1051/ita/2025020","pdf_url":"https://www.rairo-ita.org/articles/ita/pdf/2025/01/ita250010.pdf","source":{"id":"https://openalex.org/S4220651353","display_name":"RAIRO. Theoretical informatics and applications","issn_l":"2804-7346","issn":["2804-7346"],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"RAIRO - Theoretical Informatics and Applications","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://www.rairo-ita.org/articles/ita/pdf/2025/01/ita250010.pdf","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Herman Z. Q. Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I126924076","display_name":"Chongqing Normal University","ror":"https://ror.org/01dcw5w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I126924076"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Herman Z. Q. Chen","raw_affiliation_strings":["School of Mathematical Sciences; Chongqing Key Lab of Cognitive Intelligence and Intelligent Finance, Chongqing Normal University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Mathematical Sciences; Chongqing Key Lab of Cognitive Intelligence and Intelligent Finance, Chongqing Normal University","institution_ids":["https://openalex.org/I126924076"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Sergey Kitaev","orcid":null},"institutions":[{"id":"https://openalex.org/I181647926","display_name":"University of Strathclyde","ror":"https://ror.org/00n3w3b69","country_code":"GB","type":"education","lineage":["https://openalex.org/I181647926"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Sergey Kitaev","raw_affiliation_strings":["Department of Mathematics and Statistics, University of Strathclyde"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Mathematics and Statistics, University of Strathclyde","institution_ids":["https://openalex.org/I181647926"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Xiaoyu Lang","orcid":null},"institutions":[{"id":"https://openalex.org/I126924076","display_name":"Chongqing Normal University","ror":"https://ror.org/01dcw5w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I126924076"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaoyu Lang","raw_affiliation_strings":["School of Mathematical Sciences, Chongqing Normal University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Mathematical Sciences, Chongqing Normal University","institution_ids":["https://openalex.org/I126924076"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Artem Pyatkin","orcid":null},"institutions":[{"id":"https://openalex.org/I4210096862","display_name":"Sobolev Institute of Mathematics","ror":"https://ror.org/00shc0s02","country_code":"RU","type":"facility","lineage":["https://openalex.org/I1313323035","https://openalex.org/I1313323035","https://openalex.org/I4210096862","https://openalex.org/I4210124601","https://openalex.org/I4210127387"]}],"countries":["RU"],"is_corresponding":false,"raw_author_name":"Artem Pyatkin","raw_affiliation_strings":["Sobolev Institute of Mathematics, Koptyug ave, 4, Novosibirsk 630090, Russia; Novosibirsk State University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Sobolev Institute of Mathematics, Koptyug ave, 4, Novosibirsk 630090, Russia; Novosibirsk State University","institution_ids":["https://openalex.org/I4210096862"]}]},{"author_position":"last","author":{"id":null,"display_name":"Runbin Tang","orcid":null},"institutions":[{"id":"https://openalex.org/I126924076","display_name":"Chongqing Normal University","ror":"https://ror.org/01dcw5w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I126924076"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Runbin Tang","raw_affiliation_strings":["School of Mathematical Sciences, Chongqing Normal University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Mathematical Sciences, Chongqing Normal University","institution_ids":["https://openalex.org/I126924076"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I126924076"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.75932231,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"59","issue":null,"first_page":"20","last_page":"20"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.8335000276565552,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.8335000276565552,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10015","display_name":"Genomics and Phylogenetic Studies","score":0.08659999817609787,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T13664","display_name":"Genome Rearrangement Algorithms","score":0.023000000044703484,"subfield":{"id":"https://openalex.org/subfields/1311","display_name":"Genetics"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/lossless-compression","display_name":"Lossless compression","score":0.8801000118255615},{"id":"https://openalex.org/keywords/de-bruijn-sequence","display_name":"De Bruijn sequence","score":0.8618000149726868},{"id":"https://openalex.org/keywords/lossy-compression","display_name":"Lossy compression","score":0.7268000245094299},{"id":"https://openalex.org/keywords/data-compression","display_name":"Data compression","score":0.6503000259399414},{"id":"https://openalex.org/keywords/de-bruijn-graph","display_name":"De Bruijn graph","score":0.5663999915122986},{"id":"https://openalex.org/keywords/compression-ratio","display_name":"Compression ratio","score":0.5580000281333923},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.5426999926567078},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.5217000246047974},{"id":"https://openalex.org/keywords/compression","display_name":"Compression (physics)","score":0.5095999836921692}],"concepts":[{"id":"https://openalex.org/C81081738","wikidata":"https://www.wikidata.org/wiki/Q55542","display_name":"Lossless compression","level":3,"score":0.8801000118255615},{"id":"https://openalex.org/C170320093","wikidata":"https://www.wikidata.org/wiki/Q1953457","display_name":"De Bruijn sequence","level":2,"score":0.8618000149726868},{"id":"https://openalex.org/C165021410","wikidata":"https://www.wikidata.org/wiki/Q55564","display_name":"Lossy compression","level":2,"score":0.7268000245094299},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.6503000259399414},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6171000003814697},{"id":"https://openalex.org/C20218877","wikidata":"https://www.wikidata.org/wiki/Q3066095","display_name":"De Bruijn graph","level":3,"score":0.5663999915122986},{"id":"https://openalex.org/C25797200","wikidata":"https://www.wikidata.org/wiki/Q828137","display_name":"Compression ratio","level":3,"score":0.5580000281333923},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.5447999835014343},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.5426999926567078},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.5217000246047974},{"id":"https://openalex.org/C180016635","wikidata":"https://www.wikidata.org/wiki/Q2712821","display_name":"Compression (physics)","level":2,"score":0.5095999836921692},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.47870001196861267},{"id":"https://openalex.org/C43058520","wikidata":"https://www.wikidata.org/wiki/Q624580","display_name":"Eulerian path","level":3,"score":0.45170000195503235},{"id":"https://openalex.org/C157486923","wikidata":"https://www.wikidata.org/wiki/Q1376436","display_name":"String (physics)","level":2,"score":0.4226999878883362},{"id":"https://openalex.org/C2780428219","wikidata":"https://www.wikidata.org/wiki/Q16952335","display_name":"Cover (algebra)","level":2,"score":0.39250001311302185},{"id":"https://openalex.org/C311688","wikidata":"https://www.wikidata.org/wiki/Q2393193","display_name":"Time complexity","level":2,"score":0.38920000195503235},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.37880000472068787},{"id":"https://openalex.org/C162319229","wikidata":"https://www.wikidata.org/wiki/Q175263","display_name":"Data structure","level":2,"score":0.3707999885082245},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.35510000586509705},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.33379998803138733},{"id":"https://openalex.org/C94835093","wikidata":"https://www.wikidata.org/wiki/Q3113333","display_name":"Data compression ratio","level":5,"score":0.3149000108242035},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.31290000677108765},{"id":"https://openalex.org/C177860922","wikidata":"https://www.wikidata.org/wiki/Q788608","display_name":"Decorrelation","level":2,"score":0.31209999322891235},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.29980000853538513},{"id":"https://openalex.org/C2776809875","wikidata":"https://www.wikidata.org/wiki/Q1375963","display_name":"Converse","level":2,"score":0.28619998693466187},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2676999866962433},{"id":"https://openalex.org/C101722063","wikidata":"https://www.wikidata.org/wiki/Q218825","display_name":"Random access","level":2,"score":0.258899986743927}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1051/ita/2025020","is_oa":true,"landing_page_url":"https://doi.org/10.1051/ita/2025020","pdf_url":"https://www.rairo-ita.org/articles/ita/pdf/2025/01/ita250010.pdf","source":{"id":"https://openalex.org/S4220651353","display_name":"RAIRO. Theoretical informatics and applications","issn_l":"2804-7346","issn":["2804-7346"],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"RAIRO - Theoretical Informatics and Applications","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1051/ita/2025020","is_oa":true,"landing_page_url":"https://doi.org/10.1051/ita/2025020","pdf_url":"https://www.rairo-ita.org/articles/ita/pdf/2025/01/ita250010.pdf","source":{"id":"https://openalex.org/S4220651353","display_name":"RAIRO. Theoretical informatics and applications","issn_l":"2804-7346","issn":["2804-7346"],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"RAIRO - Theoretical Informatics and Applications","raw_type":"journal-article"},"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","score":0.4460061192512512,"id":"https://metadata.un.org/sdg/9"}],"awards":[{"id":"https://openalex.org/G7186543067","display_name":null,"funder_award_id":"FWNF-2022-0019","funder_id":"https://openalex.org/F4320337033","funder_display_name":"Sobolev Institute of Mathematics, Siberian Branch, Russian Academy of Sciences"}],"funders":[{"id":"https://openalex.org/F4320311970","display_name":"Chongqing Normal University","ror":"https://ror.org/01dcw5w74"},{"id":"https://openalex.org/F4320322725","display_name":"China Scholarship Council","ror":"https://ror.org/04atp4p48"},{"id":"https://openalex.org/F4320323172","display_name":"Natural Science Foundation of Chongqing","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320327639","display_name":"Centre Scientifique et Technique du B\u00e2timent","ror":"https://ror.org/02fsd1928"},{"id":"https://openalex.org/F4320337033","display_name":"Sobolev Institute of Mathematics, Siberian Branch, Russian Academy of Sciences","ror":"https://ror.org/00shc0s02"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W7108715310.pdf","grobid_xml":"https://content.openalex.org/works/W7108715310.grobid-xml"},"referenced_works_count":9,"referenced_works":["https://openalex.org/W1980182810","https://openalex.org/W2011699809","https://openalex.org/W3150463527","https://openalex.org/W3175899557","https://openalex.org/W4250968050","https://openalex.org/W4281388987","https://openalex.org/W4380084687","https://openalex.org/W4383186537","https://openalex.org/W4387425043"],"related_works":[],"abstract_inverted_index":{"Transforming":[0],"an":[1,54,69],"input":[2,156],"sequence":[3,157,193,284],"into":[4],"its":[5,191],"constituent":[6],"k":[7,22,40,109,276],"-mers":[8],"is":[9,93,154],"a":[10,31,47,262],"fundamental":[11],"operation":[12],"in":[13],"computational":[14],"genomics.":[15],"To":[16],"reduce":[17],"storage":[18,272],"costs":[19],"associated":[20],"with":[21],"-mer":[23,41,110,277],"datasets,":[24],"we":[25,120],"introduce":[26],"and":[27,99,102,116,125,143,273],"formally":[28],"analyze":[29],"MCTR,":[30],"novel":[32],"two-stage":[33],"algorithm":[34],"for":[35,190,253,267,283],"lossless":[36,170,255,271],"compression":[37,146,232],"of":[38,61,106,275],"the":[39,62,83,107,129,144,155,245],"multiset.":[42],"Our":[43,257],"core":[44,140],"method":[45],"achieves":[46],"minimal":[48,217],"text":[49,219],"representation":[50],"(\ud835\udd4e)":[51],"by":[52,68],"computing":[53],"optimal":[55],"Eulerian":[56],"cover":[57],"(minimum":[58],"string":[59],"count)":[60],"dataset's":[63],"de":[64,88],"Bruijn":[65,89],"graph,":[66],"enabled":[67],"efficient":[70],"local":[71],"Eulerization":[72],"technique.":[73],"The":[74],"resulting":[75],"strings":[76],"are":[77],"then":[78],"further":[79],"compressed":[80],"losslessly":[81],"using":[82],"Burrows-Wheeler":[84],"Transform":[85],"(BWT).":[86],"Leveraging":[87],"graph":[90],"properties,":[91],"MCTR":[92,195,227,237,260],"proven":[94],"to":[95,169,209],"achieve":[96],"linear":[97],"time":[98,142],"space":[100],"complexity":[101],"guarantees":[103],"complete":[104],"reconstruction":[105],"original":[108],"multiset,":[111],"including":[112],"frequencies.":[113],"Using":[114],"simulated":[115,214],"real":[117,188,225],"genomic":[118],"data,":[119,215,226],"evaluated":[121],"MCTR's":[122,161],"performance":[123,166],"(list":[124],"frequency":[126],"representations)":[127],"against":[128],"state-of-the-art":[130],"lossy":[131,192,280],"unitigging":[132],"tool":[133,266],"greedytigs":[134],"(from":[135],"matchtigs":[136],").":[137],"We":[138],"measured":[139],"execution":[141],"raw":[145,177,218,231],"ratio":[147],"(cr":[148,183,233,241],"=":[149],"weight(\ud835\udd44)/":[150],"weight(\ud835\udd4e),":[151],"where":[152],"\ud835\udd44":[153],"data).":[158],"Benchmarks":[159],"confirmed":[160],"data":[162,189],"fidelity":[163],"but":[164],"revealed":[165],"trade-offs":[167],"inherent":[168],"representation.":[171],"GreedyTigs":[172,179],"was":[173],"significantly":[174,249],"faster.":[175],"Regarding":[176],"compression,":[178],"achieved":[180],"high":[181],"ratios":[182],"\u2248":[184,199,234,242],"14)":[185],"on":[186,212],"noisy":[187],"output.":[194],"methods":[196,281],"exhibited":[197],"cr":[198,204],"1":[200,206],"(list)":[201,238],"or":[202,221],"even":[203,222],"&lt;":[205],"(frequency,":[207],"due":[208],"count":[210],"overhead)":[211],"clean":[213],"indicating":[216],"reduction":[220],"expansion.":[223],"On":[224],"(frequency)":[228],"showed":[229,239],"moderate":[230],"1.5\u20132.7),":[235],"while":[236],"none":[240],"1).":[243],"Importantly,":[244],"full":[246],"MCTR+BWT":[247],"pipeline":[248],"outperforms":[250],"BWT":[251],"alone":[252],"enhanced":[254],"compression.":[256],"results":[258],"establish":[259],"as":[261],"valuable,":[263],"theoretically":[264],"grounded":[265],"applications":[268],"demanding":[269],"efficient,":[270],"analysis":[274],"multisets,":[278],"complementing":[279],"optimized":[282],"summarization.":[285]},"counts_by_year":[],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2025-12-05T00:00:00"}
