{"id":"https://openalex.org/W4410241805","doi":"https://doi.org/10.1142/s0219720025500064","title":"M<sup>3</sup>-20M: A large-scale multi-modal molecule dataset for AI-driven drug design and discovery","display_name":"M<sup>3</sup>-20M: A large-scale multi-modal molecule dataset for AI-driven drug design and discovery","publication_year":2025,"publication_date":"2025-04-01","ids":{"openalex":"https://openalex.org/W4410241805","doi":"https://doi.org/10.1142/s0219720025500064","pmid":"https://pubmed.ncbi.nlm.nih.gov/40494666"},"language":"en","primary_location":{"id":"doi:10.1142/s0219720025500064","is_oa":false,"landing_page_url":"https://doi.org/10.1142/s0219720025500064","pdf_url":null,"source":{"id":"https://openalex.org/S155349577","display_name":"Journal of Bioinformatics and Computational Biology","issn_l":"0219-7200","issn":["0219-7200","1757-6334"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310311754","host_organization_name":"Imperial College Press","host_organization_lineage":["https://openalex.org/P4310311754"],"host_organization_lineage_names":["Imperial College Press"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Bioinformatics and Computational Biology","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5064450823","display_name":"Siyuan Guo","orcid":"https://orcid.org/0000-0003-0378-830X"},"institutions":[{"id":"https://openalex.org/I116953780","display_name":"Tongji University","ror":"https://ror.org/03rc6as71","country_code":"CN","type":"education","lineage":["https://openalex.org/I116953780"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Siyuan Guo","raw_affiliation_strings":["Department of Computer Science and Technology, Tongji University, No. 4800 Cao\u2019an Road, Shanghai 201804, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tongji University, No. 4800 Cao\u2019an Road, Shanghai 201804, China","institution_ids":["https://openalex.org/I116953780"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017327317","display_name":"Lexuan Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I116953780","display_name":"Tongji University","ror":"https://ror.org/03rc6as71","country_code":"CN","type":"education","lineage":["https://openalex.org/I116953780"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lexuan Wang","raw_affiliation_strings":["Department of Computer Science and Technology, Tongji University, No. 4800 Cao\u2019an Road, Shanghai 201804, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tongji University, No. 4800 Cao\u2019an Road, Shanghai 201804, China","institution_ids":["https://openalex.org/I116953780"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056822499","display_name":"Chang Jin","orcid":null},"institutions":[{"id":"https://openalex.org/I116953780","display_name":"Tongji University","ror":"https://ror.org/03rc6as71","country_code":"CN","type":"education","lineage":["https://openalex.org/I116953780"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chang Jin","raw_affiliation_strings":["Department of Computer Science and Technology, Tongji University, No. 4800 Cao\u2019an Road, Shanghai 201804, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tongji University, No. 4800 Cao\u2019an Road, Shanghai 201804, China","institution_ids":["https://openalex.org/I116953780"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101969766","display_name":"Jinxian Wang","orcid":"https://orcid.org/0000-0002-0432-2811"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jinxian Wang","raw_affiliation_strings":["Shanghai Key Lab of Intelligent Information Processing and School of Computer Science, Fudan University, 2005 Songhu Road, Shanghai 200438, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Key Lab of Intelligent Information Processing and School of Computer Science, Fudan University, 2005 Songhu Road, Shanghai 200438, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100454716","display_name":"Peng Han","orcid":"https://orcid.org/0000-0002-2505-5601"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Han Peng","raw_affiliation_strings":["Shanghai Key Lab of Intelligent Information Processing and School of Computer Science, Fudan University, 2005 Songhu Road, Shanghai 200438, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Key Lab of Intelligent Information Processing and School of Computer Science, Fudan University, 2005 Songhu Road, Shanghai 200438, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107911595","display_name":"H. C. Shi","orcid":"https://orcid.org/0009-0000-8669-7184"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Huayang Shi","raw_affiliation_strings":["Shanghai Key Lab of Intelligent Information Processing and School of Computer Science, Fudan University, 2005 Songhu Road, Shanghai 200438, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Key Lab of Intelligent Information Processing and School of Computer Science, Fudan University, 2005 Songhu Road, Shanghai 200438, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023712282","display_name":"Wengen Li","orcid":"https://orcid.org/0000-0002-8768-6740"},"institutions":[{"id":"https://openalex.org/I116953780","display_name":"Tongji University","ror":"https://ror.org/03rc6as71","country_code":"CN","type":"education","lineage":["https://openalex.org/I116953780"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wengen Li","raw_affiliation_strings":["Department of Computer Science and Technology, Tongji University, No. 4800 Cao\u2019an Road, Shanghai 201804, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tongji University, No. 4800 Cao\u2019an Road, Shanghai 201804, China","institution_ids":["https://openalex.org/I116953780"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5086316879","display_name":"Jihong Guan","orcid":"https://orcid.org/0000-0003-2313-7635"},"institutions":[{"id":"https://openalex.org/I116953780","display_name":"Tongji University","ror":"https://ror.org/03rc6as71","country_code":"CN","type":"education","lineage":["https://openalex.org/I116953780"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jihong Guan","raw_affiliation_strings":["Department of Computer Science and Technology, Tongji University, No. 4800 Cao\u2019an Road, Shanghai 201804, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tongji University, No. 4800 Cao\u2019an Road, Shanghai 201804, China","institution_ids":["https://openalex.org/I116953780"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5017862559","display_name":"Shuigeng Zhou","orcid":"https://orcid.org/0000-0002-1949-2768"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shuigeng Zhou","raw_affiliation_strings":["Shanghai Key Lab of Intelligent Information Processing and School of Computer Science, Fudan University, 2005 Songhu Road, Shanghai 200438, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Key Lab of Intelligent Information Processing and School of Computer Science, Fudan University, 2005 Songhu Road, Shanghai 200438, China","institution_ids":["https://openalex.org/I24943067"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5064450823"],"corresponding_institution_ids":["https://openalex.org/I116953780"],"apc_list":null,"apc_paid":null,"fwci":3.0167,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.91130684,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":"23","issue":"02","first_page":"2550006","last_page":"2550006"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10211","display_name":"Computational Drug Discovery Methods","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10211","display_name":"Computational Drug Discovery Methods","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.996399998664856,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13937","display_name":"Genetics, Bioinformatics, and Biomedical Research","score":0.9093999862670898,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.5931739211082458},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.5227599143981934},{"id":"https://openalex.org/keywords/drug-discovery","display_name":"Drug discovery","score":0.48751017451286316},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.4865214228630066},{"id":"https://openalex.org/keywords/computational-biology","display_name":"Computational biology","score":0.37177345156669617},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.36306339502334595},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.3532868027687073},{"id":"https://openalex.org/keywords/biology","display_name":"Biology","score":0.1921256184577942},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.1893754005432129},{"id":"https://openalex.org/keywords/chemistry","display_name":"Chemistry","score":0.18623751401901245},{"id":"https://openalex.org/keywords/bioinformatics","display_name":"Bioinformatics","score":0.1386547088623047}],"concepts":[{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.5931739211082458},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.5227599143981934},{"id":"https://openalex.org/C74187038","wikidata":"https://www.wikidata.org/wiki/Q1418791","display_name":"Drug discovery","level":2,"score":0.48751017451286316},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4865214228630066},{"id":"https://openalex.org/C70721500","wikidata":"https://www.wikidata.org/wiki/Q177005","display_name":"Computational biology","level":1,"score":0.37177345156669617},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.36306339502334595},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3532868027687073},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.1921256184577942},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.1893754005432129},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.18623751401901245},{"id":"https://openalex.org/C60644358","wikidata":"https://www.wikidata.org/wiki/Q128570","display_name":"Bioinformatics","level":1,"score":0.1386547088623047},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[{"descriptor_ui":"D001185","descriptor_name":"Artificial Intelligence","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D001185","descriptor_name":"Artificial Intelligence","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D001185","descriptor_name":"Artificial Intelligence","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D015195","descriptor_name":"Drug Design","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D015195","descriptor_name":"Drug Design","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D015195","descriptor_name":"Drug Design","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D016208","descriptor_name":"Databases, Factual","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D016208","descriptor_name":"Databases, Factual","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D016208","descriptor_name":"Databases, Factual","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D019295","descriptor_name":"Computational Biology","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":false},{"descriptor_ui":"D019295","descriptor_name":"Computational Biology","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":false},{"descriptor_ui":"D019295","descriptor_name":"Computational Biology","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":false},{"descriptor_ui":"D055808","descriptor_name":"Drug Discovery","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":true},{"descriptor_ui":"D055808","descriptor_name":"Drug Discovery","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":true},{"descriptor_ui":"D055808","descriptor_name":"Drug Discovery","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":true}],"locations_count":2,"locations":[{"id":"doi:10.1142/s0219720025500064","is_oa":false,"landing_page_url":"https://doi.org/10.1142/s0219720025500064","pdf_url":null,"source":{"id":"https://openalex.org/S155349577","display_name":"Journal of Bioinformatics and Computational Biology","issn_l":"0219-7200","issn":["0219-7200","1757-6334"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310311754","host_organization_name":"Imperial College Press","host_organization_lineage":["https://openalex.org/P4310311754"],"host_organization_lineage_names":["Imperial College Press"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Bioinformatics and Computational Biology","raw_type":"journal-article"},{"id":"pmid:40494666","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/40494666","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of bioinformatics and computational biology","raw_type":null}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G2340890655","display_name":null,"funder_award_id":"62172300","funder_id":"https://openalex.org/F4320315254","funder_display_name":"Innovative Research Group Project of the National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320315254","display_name":"Innovative Research Group Project of the National Natural Science Foundation of China","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":39,"referenced_works":["https://openalex.org/W1500036797","https://openalex.org/W2022476850","https://openalex.org/W2043509228","https://openalex.org/W2060531713","https://openalex.org/W2080635178","https://openalex.org/W2091159078","https://openalex.org/W2096541451","https://openalex.org/W2114704115","https://openalex.org/W2169678694","https://openalex.org/W2176516200","https://openalex.org/W2177317049","https://openalex.org/W2206840988","https://openalex.org/W2546948247","https://openalex.org/W2558217333","https://openalex.org/W2594183968","https://openalex.org/W2912924812","https://openalex.org/W3034806393","https://openalex.org/W3036527662","https://openalex.org/W3087318293","https://openalex.org/W3099414221","https://openalex.org/W3116865743","https://openalex.org/W3209056694","https://openalex.org/W3217546525","https://openalex.org/W3217681006","https://openalex.org/W4212837331","https://openalex.org/W4281871249","https://openalex.org/W4285294723","https://openalex.org/W4297179162","https://openalex.org/W4317702955","https://openalex.org/W4321254242","https://openalex.org/W4366824201","https://openalex.org/W4389888290","https://openalex.org/W4391345293","https://openalex.org/W4391542588","https://openalex.org/W4393936599","https://openalex.org/W4396216328","https://openalex.org/W4401041686","https://openalex.org/W4406030356","https://openalex.org/W4406537581"],"related_works":["https://openalex.org/W2480711475","https://openalex.org/W2379392295","https://openalex.org/W3160965418","https://openalex.org/W613940353","https://openalex.org/W2320915480","https://openalex.org/W2013003323","https://openalex.org/W1981076832","https://openalex.org/W2024773645","https://openalex.org/W2001717001","https://openalex.org/W2043147644"],"abstract_inverted_index":{"This":[0,78],"paper":[1],"introduces":[2],"M<sup>3</sup>-20M,":[3],"a":[4,103],"large-scale":[5],"<i>Multi-Modal":[6],"Molecule</i>":[7],"dataset":[8,79,197],"that":[9,58,148],"contains":[10],"over":[11],"<i>20":[12],"million</i>":[13],"molecules,":[14],"with":[15],"the":[16,45,50,62,111,161,183],"data":[17],"mainly":[18],"being":[19],"integrated":[20],"from":[21],"existing":[22,52,178],"databases":[23],"and":[24,37,75,91,98,118,130,142,167,171,185,194],"partially":[25],"generated":[26,99],"by":[27],"large":[28,69,135],"language":[29,70,136],"models.":[30],"Designed":[31],"to":[32,163],"support":[33],"AI-driven":[34,191],"drug":[35,73,116,192],"design":[36,74,117,193],"discovery,":[38,119],"M<sup>3</sup>-20M":[39,114,149,188],"is":[40,198],"71":[41],"times":[42],"more":[43,165],"in":[44,115,155,189],"number":[46],"of":[47,66,106,113,187],"molecules":[48],"than":[49,177],"largest":[51],"dataset,":[53],"providing":[54],"an":[55],"unprecedented":[56],"scale":[57],"can":[59,150],"highly":[60],"benefit":[61],"training":[63],"or":[64],"fine-tuning":[65],"models,":[67],"including":[68,138],"models":[71,137,162],"for":[72],"discovery":[76],"tasks.":[77,157],"integrates":[80],"one-dimensional":[81],"SMILES,":[82],"two-dimensional":[83],"molecular":[84,87,131,169],"graphs,":[85],"three-dimensional":[86],"structures,":[88],"physicochemical":[89],"properties,":[90],"textual":[92],"descriptions":[93],"collected":[94],"through":[95],"web":[96],"crawling":[97],"using":[100,134],"GPT-3.5,":[101,140],"offering":[102],"comprehensive":[104],"view":[105],"each":[107],"molecule.":[108],"To":[109],"demonstrate":[110],"power":[112],"we":[120],"conduct":[121],"extensive":[122],"experiments":[123],"on":[124],"two":[125],"key":[126],"tasks:":[127],"molecule":[128],"generation":[129],"property":[132,174],"prediction,":[133],"GLM4,":[139],"GPT-4,":[141],"Llama3-8b.":[143],"Our":[144],"experimental":[145],"results":[146],"show":[147],"significantly":[151],"boost":[152],"model":[153],"performance":[154],"both":[156],"Specifically,":[158],"it":[159],"enables":[160],"generate":[164],"diverse":[166],"valid":[168],"structures":[170],"achieve":[172],"higher":[173],"prediction":[175],"accuracy":[176],"single-modal":[179],"datasets,":[180],"which":[181],"validates":[182],"value":[184],"potential":[186],"supporting":[190],"discovery.":[195],"The":[196],"available":[199],"at":[200],"https://github.com/bz99bz/M-3.":[201]},"counts_by_year":[{"year":2025,"cited_by_count":2}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
