{"id":"https://openalex.org/W4400346748","doi":"https://doi.org/10.1088/2632-2153/ad5fde","title":"An extended de Bruijn graph for feature engineering over biological sequential data","display_name":"An extended de Bruijn graph for feature engineering over biological sequential data","publication_year":2024,"publication_date":"2024-07-05","ids":{"openalex":"https://openalex.org/W4400346748","doi":"https://doi.org/10.1088/2632-2153/ad5fde"},"language":"en","primary_location":{"id":"doi:10.1088/2632-2153/ad5fde","is_oa":true,"landing_page_url":"https://doi.org/10.1088/2632-2153/ad5fde","pdf_url":"https://iopscience.iop.org/article/10.1088/2632-2153/ad5fde/pdf","source":{"id":"https://openalex.org/S4210200687","display_name":"Machine Learning Science and Technology","issn_l":"2632-2153","issn":["2632-2153"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320083","host_organization_name":"IOP Publishing","host_organization_lineage":["https://openalex.org/P4310320083","https://openalex.org/P4310311669"],"host_organization_lineage_names":["IOP Publishing","Institute of Physics"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Machine Learning: Science and Technology","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://iopscience.iop.org/article/10.1088/2632-2153/ad5fde/pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100020358","display_name":"Mert Onur Cakiroglu","orcid":"https://orcid.org/0009-0001-0798-1361"},"institutions":[{"id":"https://openalex.org/I4210119109","display_name":"Indiana University Bloomington","ror":"https://ror.org/02k40bc56","country_code":"US","type":"education","lineage":["https://openalex.org/I4210119109","https://openalex.org/I592451"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Mert Onur Cakiroglu","raw_affiliation_strings":["Computer Science Department, Indiana University, Bloomington, IN, United States of America"],"raw_orcid":"https://orcid.org/0009-0001-0798-1361","affiliations":[{"raw_affiliation_string":"Computer Science Department, Indiana University, Bloomington, IN, United States of America","institution_ids":["https://openalex.org/I4210119109"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070970331","display_name":"Hasan Kurban","orcid":"https://orcid.org/0000-0003-3142-2866"},"institutions":[{"id":"https://openalex.org/I4210119109","display_name":"Indiana University Bloomington","ror":"https://ror.org/02k40bc56","country_code":"US","type":"education","lineage":["https://openalex.org/I4210119109","https://openalex.org/I592451"]},{"id":"https://openalex.org/I4210144839","display_name":"Hamad bin Khalifa University","ror":"https://ror.org/03eyq4y97","country_code":"QA","type":"education","lineage":["https://openalex.org/I4210144839"]},{"id":"https://openalex.org/I58152225","display_name":"Texas A&M University at Qatar","ror":"https://ror.org/03vb4dm14","country_code":"QA","type":"education","lineage":["https://openalex.org/I58152225","https://openalex.org/I91045830"]}],"countries":["QA","US"],"is_corresponding":true,"raw_author_name":"Hasan Kurban","raw_affiliation_strings":["College of Science and Engineering, Hamad Bin Khalifa University, Doha, Qatar","Computer Science Department, Indiana University, Bloomington, IN, United States of America","Data Science Program, Indiana University, Bloomington, IN, United States of America","Electrical and Computer Engineering, Texas A&M University at Qatar, Doha, Qatar"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"College of Science and Engineering, Hamad Bin Khalifa University, Doha, Qatar","institution_ids":["https://openalex.org/I4210144839"]},{"raw_affiliation_string":"Computer Science Department, Indiana University, Bloomington, IN, United States of America","institution_ids":["https://openalex.org/I4210119109"]},{"raw_affiliation_string":"Data Science Program, Indiana University, Bloomington, IN, United States of America","institution_ids":["https://openalex.org/I4210119109"]},{"raw_affiliation_string":"Electrical and Computer Engineering, Texas A&M University at Qatar, Doha, Qatar","institution_ids":["https://openalex.org/I58152225"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040672401","display_name":"Parichit Sharma","orcid":"https://orcid.org/0000-0003-0822-1089"},"institutions":[{"id":"https://openalex.org/I4210119109","display_name":"Indiana University Bloomington","ror":"https://ror.org/02k40bc56","country_code":"US","type":"education","lineage":["https://openalex.org/I4210119109","https://openalex.org/I592451"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Parichit Sharma","raw_affiliation_strings":["Computer Science Department, Indiana University, Bloomington, IN, United States of America"],"raw_orcid":"https://orcid.org/0000-0003-0822-1089","affiliations":[{"raw_affiliation_string":"Computer Science Department, Indiana University, Bloomington, IN, United States of America","institution_ids":["https://openalex.org/I4210119109"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024416450","display_name":"M. O\u011fuzhan K\u00fclek\u00e7i","orcid":"https://orcid.org/0000-0002-4583-6261"},"institutions":[{"id":"https://openalex.org/I4210119109","display_name":"Indiana University Bloomington","ror":"https://ror.org/02k40bc56","country_code":"US","type":"education","lineage":["https://openalex.org/I4210119109","https://openalex.org/I592451"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"M Oguzhan Kulekci","raw_affiliation_strings":["Computer Science Department, Indiana University, Bloomington, IN, United States of America"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Computer Science Department, Indiana University, Bloomington, IN, United States of America","institution_ids":["https://openalex.org/I4210119109"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037076068","display_name":"Elham Khorasani Buxton","orcid":null},"institutions":[{"id":"https://openalex.org/I79884896","display_name":"University of Illinois at Springfield","ror":"https://ror.org/0126qma51","country_code":"US","type":"education","lineage":["https://openalex.org/I79884896"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Elham Khorasani Buxton","raw_affiliation_strings":["Computer Science Department, University of Illinois, Springfield, IL, United States of America"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Computer Science Department, University of Illinois, Springfield, IL, United States of America","institution_ids":["https://openalex.org/I79884896"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073412412","display_name":"Maryam Raeeszadeh\u2010Sarmazdeh","orcid":"https://orcid.org/0000-0003-2946-5584"},"institutions":[{"id":"https://openalex.org/I134113660","display_name":"University of Nevada, Reno","ror":"https://ror.org/01keh0577","country_code":"US","type":"education","lineage":["https://openalex.org/I134113660"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Maryam Raeeszadeh-Sarmazdeh","raw_affiliation_strings":["Chemical and Materials Engineering, University of Nevada, Reno, NV, United States of America"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Chemical and Materials Engineering, University of Nevada, Reno, NV, United States of America","institution_ids":["https://openalex.org/I134113660"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5081672389","display_name":"Mehmet Dalk\u0131l\u0131\u00e7","orcid":"https://orcid.org/0000-0002-0776-7365"},"institutions":[{"id":"https://openalex.org/I4210119109","display_name":"Indiana University Bloomington","ror":"https://ror.org/02k40bc56","country_code":"US","type":"education","lineage":["https://openalex.org/I4210119109","https://openalex.org/I592451"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Mehmet M Dalkilic","raw_affiliation_strings":["Computer Science Department, Indiana University, Bloomington, IN, United States of America","Data Science Program, Indiana University, Bloomington, IN, United States of America"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Computer Science Department, Indiana University, Bloomington, IN, United States of America","institution_ids":["https://openalex.org/I4210119109"]},{"raw_affiliation_string":"Data Science Program, Indiana University, Bloomington, IN, United States of America","institution_ids":["https://openalex.org/I4210119109"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5024416450","https://openalex.org/A5070970331","https://openalex.org/A5073412412","https://openalex.org/A5081672389"],"corresponding_institution_ids":["https://openalex.org/I134113660","https://openalex.org/I4210119109","https://openalex.org/I4210144839","https://openalex.org/I58152225"],"apc_list":{"value":1600,"currency":"GBP","value_usd":1962},"apc_paid":{"value":1600,"currency":"GBP","value_usd":1962},"fwci":0.8739,"has_fulltext":true,"cited_by_count":4,"citation_normalized_percentile":{"value":0.72703371,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":"5","issue":"3","first_page":"035020","last_page":"035020"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10885","display_name":"Gene expression and cancer classification","score":0.9927999973297119,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T10885","display_name":"Gene expression and cancer classification","score":0.9927999973297119,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T12254","display_name":"Machine Learning in Bioinformatics","score":0.9805999994277954,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10057","display_name":"Face and Expression Recognition","score":0.9628999829292297,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/de-bruijn-graph","display_name":"De Bruijn graph","score":0.7111433744430542},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6631186604499817},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.5488505959510803},{"id":"https://openalex.org/keywords/biological-data","display_name":"Biological data","score":0.5222830772399902},{"id":"https://openalex.org/keywords/de-bruijn-sequence","display_name":"De Bruijn sequence","score":0.5175580382347107},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.45946216583251953},{"id":"https://openalex.org/keywords/theoretical-computer-science","display_name":"Theoretical computer science","score":0.36684221029281616},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.159801185131073},{"id":"https://openalex.org/keywords/biology","display_name":"Biology","score":0.1503070592880249},{"id":"https://openalex.org/keywords/bioinformatics","display_name":"Bioinformatics","score":0.11488372087478638},{"id":"https://openalex.org/keywords/discrete-mathematics","display_name":"Discrete mathematics","score":0.09486699104309082}],"concepts":[{"id":"https://openalex.org/C20218877","wikidata":"https://www.wikidata.org/wiki/Q3066095","display_name":"De Bruijn graph","level":3,"score":0.7111433744430542},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6631186604499817},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.5488505959510803},{"id":"https://openalex.org/C201797286","wikidata":"https://www.wikidata.org/wiki/Q4914986","display_name":"Biological data","level":2,"score":0.5222830772399902},{"id":"https://openalex.org/C170320093","wikidata":"https://www.wikidata.org/wiki/Q1953457","display_name":"De Bruijn sequence","level":2,"score":0.5175580382347107},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.45946216583251953},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.36684221029281616},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.159801185131073},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.1503070592880249},{"id":"https://openalex.org/C60644358","wikidata":"https://www.wikidata.org/wiki/Q128570","display_name":"Bioinformatics","level":1,"score":0.11488372087478638},{"id":"https://openalex.org/C118615104","wikidata":"https://www.wikidata.org/wiki/Q121416","display_name":"Discrete mathematics","level":1,"score":0.09486699104309082},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1088/2632-2153/ad5fde","is_oa":true,"landing_page_url":"https://doi.org/10.1088/2632-2153/ad5fde","pdf_url":"https://iopscience.iop.org/article/10.1088/2632-2153/ad5fde/pdf","source":{"id":"https://openalex.org/S4210200687","display_name":"Machine Learning Science and Technology","issn_l":"2632-2153","issn":["2632-2153"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320083","host_organization_name":"IOP Publishing","host_organization_lineage":["https://openalex.org/P4310320083","https://openalex.org/P4310311669"],"host_organization_lineage_names":["IOP Publishing","Institute of Physics"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Machine Learning: Science and Technology","raw_type":"journal-article"},{"id":"pmh:oai:doaj.org/article:23a128f4de7a4132985c8bd6c201005e","is_oa":true,"landing_page_url":"https://doaj.org/article/23a128f4de7a4132985c8bd6c201005e","pdf_url":null,"source":{"id":"https://openalex.org/S4306401280","display_name":"DOAJ (DOAJ: Directory of Open Access Journals)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Machine Learning: Science and Technology, Vol 5, Iss 3, p 035020 (2024)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.1088/2632-2153/ad5fde","is_oa":true,"landing_page_url":"https://doi.org/10.1088/2632-2153/ad5fde","pdf_url":"https://iopscience.iop.org/article/10.1088/2632-2153/ad5fde/pdf","source":{"id":"https://openalex.org/S4210200687","display_name":"Machine Learning Science and Technology","issn_l":"2632-2153","issn":["2632-2153"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320083","host_organization_name":"IOP Publishing","host_organization_lineage":["https://openalex.org/P4310320083","https://openalex.org/P4310311669"],"host_organization_lineage_names":["IOP Publishing","Institute of Physics"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Machine Learning: Science and Technology","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4400346748.pdf","grobid_xml":"https://content.openalex.org/works/W4400346748.grobid-xml"},"referenced_works_count":50,"referenced_works":["https://openalex.org/W26112474","https://openalex.org/W1560026119","https://openalex.org/W1580213736","https://openalex.org/W1709815505","https://openalex.org/W1822648038","https://openalex.org/W1831050183","https://openalex.org/W1969051510","https://openalex.org/W1980897319","https://openalex.org/W1998300401","https://openalex.org/W2020999234","https://openalex.org/W2028904582","https://openalex.org/W2047630272","https://openalex.org/W2048869777","https://openalex.org/W2050284387","https://openalex.org/W2055043387","https://openalex.org/W2081598613","https://openalex.org/W2087064593","https://openalex.org/W2089783852","https://openalex.org/W2097175728","https://openalex.org/W2116587778","https://openalex.org/W2117190680","https://openalex.org/W2142678478","https://openalex.org/W2143210482","https://openalex.org/W2147274048","https://openalex.org/W2152210981","https://openalex.org/W2155148243","https://openalex.org/W2157952888","https://openalex.org/W2163195825","https://openalex.org/W2397046351","https://openalex.org/W2492295672","https://openalex.org/W2751152975","https://openalex.org/W2761583818","https://openalex.org/W2781086737","https://openalex.org/W2906991531","https://openalex.org/W2918335507","https://openalex.org/W2921491881","https://openalex.org/W2943221422","https://openalex.org/W2943591933","https://openalex.org/W3021927547","https://openalex.org/W3044140833","https://openalex.org/W3049692992","https://openalex.org/W3081019734","https://openalex.org/W3164046276","https://openalex.org/W3167609273","https://openalex.org/W4285326236","https://openalex.org/W4318477680","https://openalex.org/W6634719314","https://openalex.org/W6684238098","https://openalex.org/W6712418502","https://openalex.org/W6762166224"],"related_works":["https://openalex.org/W2562683361","https://openalex.org/W3087469195","https://openalex.org/W2354744388","https://openalex.org/W3125399386","https://openalex.org/W2949121831","https://openalex.org/W1908987670","https://openalex.org/W2756186577","https://openalex.org/W2735897904","https://openalex.org/W3215786367","https://openalex.org/W3021341025"],"abstract_inverted_index":{"Abstract":[0],"In":[1],"this":[2],"study,":[3],"we":[4],"introduce":[5],"a":[6,57,102],"novel":[7],"de":[8],"Bruijn":[9],"graph":[10],"(dBG)":[11],"based":[12],"framework":[13,25,41,117],"for":[14,35,43,84],"feature":[15,27],"engineering":[16],"in":[17,53,131],"biological":[18],"sequential":[19],"data":[20,96],"such":[21],"as":[22],"proteins.":[23],"This":[24],"simplifies":[26],"extraction":[28],"by":[29,47,67],"dynamically":[30],"generating":[31],"high-quality,":[32],"interpretable":[33],"features":[34,83],"traditional":[36],"AI":[37],"(TAI)":[38],"algorithms.":[39,86],"Our":[40],"accounts":[42],"amino":[44],"acid":[45],"substitutions":[46],"efficiently":[48],"adjusting":[49],"the":[50,54,65,69,111],"edge":[51],"weights":[52],"dBG":[55,66],"using":[56],"secondary":[58],"trie":[59],"structure.":[60],"We":[61],"extract":[62],"motifs":[63,123],"from":[64,110],"traversing":[68],"heavy":[70],"edges,":[71],"and":[72,79,108,121],"then":[73],"incorporate":[74],"alignment":[75],"algorithms":[76],"like":[77],"BLAST":[78],"Smith\u2013Waterman":[80],"to":[81],"generate":[82],"TAI":[85],"Empirical":[87],"validation":[88],"on":[89],"TIMP":[90],"(tissue":[91],"inhibitors":[92],"of":[93],"matrix":[94],"metalloproteinase)":[95],"demonstrates":[97],"significant":[98],"accuracy":[99],"improvements":[100],"over":[101],"robust":[103],"baseline,":[104],"state-of-the-art":[105],"PLM":[106],"models,":[107],"those":[109],"popular":[112],"GLAM2":[113],"tool.":[114],"Furthermore,":[115],"our":[116],"successfully":[118],"identified":[119],"Glycine":[120],"Arginine-rich":[122],"with":[124],"high":[125],"coverage,":[126],"highlighting":[127],"it":[128],"is":[129],"potential":[130],"general":[132],"pattern":[133],"discovery.":[134]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2024,"cited_by_count":3}],"updated_date":"2026-05-06T08:25:59.206177","created_date":"2025-10-10T00:00:00"}
