{"id":"https://openalex.org/W4393521160","doi":"https://doi.org/10.5281/zenodo.5746468","title":"PheneBank: Processed Medline Abstracts and PMC full articles + Phenotype-Disease Associations","display_name":"PheneBank: Processed Medline Abstracts and PMC full articles + Phenotype-Disease Associations","publication_year":2020,"publication_date":"2020-07-18","ids":{"openalex":"https://openalex.org/W4393521160","doi":"https://doi.org/10.5281/zenodo.5746468"},"language":"en","primary_location":{"id":"pmh:oai:zenodo.org:5746468","is_oa":true,"landing_page_url":"https://zenodo.org/record/5746468","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"info:eu-repo/semantics/other"},"type":"dataset","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://zenodo.org/record/5746468","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5110555562","display_name":"Mohammad Taher Pilehvar","orcid":null},"institutions":[{"id":"https://openalex.org/I241749","display_name":"University of Cambridge","ror":"https://ror.org/013meh722","country_code":"GB","type":"education","lineage":["https://openalex.org/I241749"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Pilehvar, Mohammad Taher","raw_affiliation_strings":["University of Cambridge"],"affiliations":[{"raw_affiliation_string":"University of Cambridge","institution_ids":["https://openalex.org/I241749"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112708286","display_name":"Nigel Collier","orcid":null},"institutions":[{"id":"https://openalex.org/I241749","display_name":"University of Cambridge","ror":"https://ror.org/013meh722","country_code":"GB","type":"education","lineage":["https://openalex.org/I241749"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Collier, Nigel","raw_affiliation_strings":["University of Cambridge"],"affiliations":[{"raw_affiliation_string":"University of Cambridge","institution_ids":["https://openalex.org/I241749"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080121437","display_name":"Adam S. Bernard","orcid":"https://orcid.org/0000-0003-2699-5168"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bernard, Adam S.","raw_affiliation_strings":["Queen Mary University"],"affiliations":[{"raw_affiliation_string":"Queen Mary University","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5009950897","display_name":"Damian Smedley","orcid":"https://orcid.org/0000-0002-5836-9850"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Smedley, Damian","raw_affiliation_strings":["Queen Mary University"],"affiliations":[{"raw_affiliation_string":"Queen Mary University","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5110555562"],"corresponding_institution_ids":["https://openalex.org/I241749"],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13656","display_name":"Science, Research, and Medicine","score":0.2143000066280365,"subfield":{"id":"https://openalex.org/subfields/2743","display_name":"Reproductive Medicine"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},"topics":[{"id":"https://openalex.org/T13656","display_name":"Science, Research, and Medicine","score":0.2143000066280365,"subfield":{"id":"https://openalex.org/subfields/2743","display_name":"Reproductive Medicine"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/medline","display_name":"MEDLINE","score":0.557734489440918},{"id":"https://openalex.org/keywords/phenotype","display_name":"Phenotype","score":0.5315302014350891},{"id":"https://openalex.org/keywords/clinical-phenotype","display_name":"Clinical phenotype","score":0.49895572662353516},{"id":"https://openalex.org/keywords/medicine","display_name":"Medicine","score":0.3727653920650482},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3206411898136139},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.31758829951286316},{"id":"https://openalex.org/keywords/biology","display_name":"Biology","score":0.24250802397727966},{"id":"https://openalex.org/keywords/genetics","display_name":"Genetics","score":0.15642276406288147},{"id":"https://openalex.org/keywords/gene","display_name":"Gene","score":0.03846439719200134}],"concepts":[{"id":"https://openalex.org/C2779473830","wikidata":"https://www.wikidata.org/wiki/Q1540899","display_name":"MEDLINE","level":2,"score":0.557734489440918},{"id":"https://openalex.org/C127716648","wikidata":"https://www.wikidata.org/wiki/Q104053","display_name":"Phenotype","level":3,"score":0.5315302014350891},{"id":"https://openalex.org/C3020646490","wikidata":"https://www.wikidata.org/wiki/Q25203551","display_name":"Clinical phenotype","level":4,"score":0.49895572662353516},{"id":"https://openalex.org/C71924100","wikidata":"https://www.wikidata.org/wiki/Q11190","display_name":"Medicine","level":0,"score":0.3727653920650482},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3206411898136139},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.31758829951286316},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.24250802397727966},{"id":"https://openalex.org/C54355233","wikidata":"https://www.wikidata.org/wiki/Q7162","display_name":"Genetics","level":1,"score":0.15642276406288147},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.03846439719200134},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:zenodo.org:5746468","is_oa":true,"landing_page_url":"https://zenodo.org/record/5746468","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"info:eu-repo/semantics/other"},{"id":"doi:10.5281/zenodo.5746468","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.5746468","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"dataset"}],"best_oa_location":{"id":"pmh:oai:zenodo.org:5746468","is_oa":true,"landing_page_url":"https://zenodo.org/record/5746468","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"info:eu-repo/semantics/other"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W3164771895","https://openalex.org/W2604464132","https://openalex.org/W2161088626","https://openalex.org/W3029893540","https://openalex.org/W2314682585","https://openalex.org/W2313584493","https://openalex.org/W2135667114","https://openalex.org/W2110968082","https://openalex.org/W3122747281","https://openalex.org/W2966053455"],"abstract_inverted_index":{"<strong>The":[0,260],"PheneBank":[1,114,267],"project:</strong>":[2],"Free":[3],"text":[4,74,508],"scientific":[5],"literature":[6,129],"has":[7,82,366,474],"the":[8,20,50,54,73,102,108,128,168,187,197,223,228,233,250,256,266,271,321,341,347,352,357,427,481,486,503,513,541,547,561,574,584],"potential":[9],"to":[10,92,117,131,144,172,181,307,346,386,566,583,587],"be":[11,380],"an":[12,179,263],"incredibly":[13],"valuable":[14],"source":[15],"of":[16,58,63,89,167,194,212,244,249,265,273,290,392,512],"data":[17,378],"for":[18,52,96,147,382,502,525,573],"uncovering":[19],"often":[21],"hidden":[22],"relationships":[23],"between":[24,152,189],"genes,":[25],"diseases":[26,154],"and":[27,37,43,56,107,119,130,155,191,232,300,315,394,425,431,517,535],"phenotypes.":[28,195],"Phenotypic":[29],"descriptions":[30,48],"cover":[31],"abnormalities":[32],"in":[33,332,396,540],"anatomical":[34],"structures,":[35],"processes":[36],"behaviours.":[38],"For":[39],"example":[40],"'growth":[41],"delay'":[42],"'body":[44],"weight":[45],"loss'.":[46],"Such":[47],"form":[49],"basis":[51],"determining":[53],"existence":[55],"treatment":[57],"a":[59,86,204,370,397,421,523],"disease":[60],"but,":[61],"because":[62],"their":[64,567],"inherent":[65],"complexity,":[66],"have":[67,159,304,488],"previously":[68],"received":[69],"less":[70],"attention":[71],"by":[72,85,185,227],"mining":[75],"community.":[76],"In":[77,317],"recent":[78],"years,":[79],"significant":[80,150],"effort":[81],"been":[83,305,367,489],"spent":[84],"small":[87],"number":[88],"expert":[90],"curators":[91],"create":[93],"coding":[94],"systems":[95],"phenotypes":[97,169,245],"(called":[98],"\"ontologies\"),":[99],"such":[100,136],"as":[101,203,278,280,531],"Human":[103],"Phenotype":[104,110,597,600,610],"Ontology":[105,111],"(HP)":[106],"Mammalian":[109],"(MP).":[112],"The":[113,141,215,302,363,384,434,471],"project":[115,142],"proposes":[116],"support":[118],"speed":[120],"up":[121,255],"curation":[122],"using":[123,246,369],"terms":[124],"discovered":[125],"directly":[126],"from":[127,161,351,356,484,509],"automatically":[132,326],"integrate":[133],"them":[134],"with":[135,287,420,522,546],"standard":[137],"ontologies.":[138],"<br>":[139,140,259,361,401,571],"seeks":[143],"harness":[145],"texts":[146],"extracting":[148],"statistically":[149],"associations":[151,323,354],"phenotypes,":[153],"genes.":[156],"Earlier":[157],"approaches":[158],"suffered":[160],"not":[162],"providing":[163],"deep":[164,175],"semantic":[165,390,399],"representations":[166],"they":[170],"tried":[171],"target.":[173],"Our":[174],"learning-based":[176],"approach":[177,216],"is":[178,418,436],"attempt":[180],"overcome":[182],"this":[183],"issue":[184],"reducing":[186],"uncertainty":[188],"textual":[190],"ontological":[192],"forms":[193],"Specifically,":[196],"model":[198,373],"treats":[199],"multitoken":[200],"named":[201,419,479],"entities":[202,303,395],"single":[205],"token":[206],"which":[207,477,485],"allows":[208],"more":[209],"reliable":[210],"handling":[211],"multiword":[213],"expressions.":[214],"builds":[217],"on":[218,329,376,389,443],"ground":[219],"breaking":[220],"research":[221],"at":[222],"European":[224],"Bininformatics":[225],"Institute":[226],"PI":[229],"(Nigel":[230],"Collier)":[231],"Co-investigator":[234],"(Damian":[235],"Smedley,":[236],"Queen":[237],"Mary":[238],"University":[239],"London),":[240],"including":[241],"terminology":[242],"alignment":[243],"pairwise":[247],"scoring":[248],"conceptual":[251],"elements":[252],"that":[253,324,409,528,560,581],"make":[254],"phenotype.":[257],"http://www.phenebank.org":[258],"dataset:</strong>":[261],"As":[262,572],"output":[264],"project,":[268],"we":[269,319,339],"release":[270,320],"set":[272],"24":[274],"million":[275],"MEDLINE":[276],"abstracts":[277],"well":[279],"3.8M":[281],"open-access":[282],"PMC":[283],"full":[284],"articles":[285,487],"annotated":[286],"9":[288],"classes":[289],"entity:":[291],"Phenotype,":[292],"Disease,":[293],"Anatomy,":[294],"Cell,":[295],"Cell_line,":[296],"GPR,":[297],"Gene_variant,":[298],"Molecule,":[299],"Pathway.":[301],"mapped":[306],"five":[308],"major":[309],"ontologies:":[310],"SNOMED,":[311],"HPO,":[312],"MeSH,":[313],"PRO,":[314],"FMA.":[316],"addition,":[318],"phenotype-disease":[322],"are":[325,410,478,492,529,538,563,578],"extracted":[327],"based":[328,442],"co-occurrences":[330],"statistics":[331],"Medline":[333],"abstracts.":[334],"Among":[335],"different":[336,510],"statistical":[337],"measures":[338],"evaluated,":[340],"Fisher":[342,613],"test":[343],"best":[344],"corresponded":[345],"known":[348],"tuples":[349],"available":[350,355],"curated":[353],"Monarch":[358],"Initiative":[359],"(https://monarchinitiative.org).":[360],"<strong>Processing:</strong>":[362],"NER":[364],"tagging":[365],"done":[368],"BiLSTM-CRF":[371],"neural":[372],"(https://github.com/pilehvar/phenebank)":[374],"trained":[375],"expert-annotated":[377],"(to":[379],"released":[381],"research).":[383],"grounding":[385],"ontologies":[387],"relies":[388],"embedding":[391],"concepts":[393,562],"unified":[398],"space.":[400],"<strong>Data":[402],"format:</strong>":[403],"<strong>PheneBank_Processed_PubMed.part[x].tar.gz":[404],"</strong>contains":[405],"24,359,010":[406],".txt":[407,416,500],"files":[408,473,494,501],"classified":[411],"into":[412,438],"812":[413],"directories.":[414],"Each":[415,519],"file":[417],"PubMed":[422],"article":[423,497],"ID":[424],"contains":[426],"corresponding":[428],"article's":[429],"abstract":[430],"its":[432],"annotations.":[433],"dataset":[435],"split":[437],"four":[439],"(unequal)":[440],"parts":[441,511],"PubMed's":[444],"structure:<br>":[445],"part1:":[446],"medline16n00*":[447],"medline16n01*":[448],"medline16n02*":[449],"[299":[450],"directories,":[451,457,463,469],"2.8GB]<br>":[452],"part2:":[453],"medline16n03*":[454],"medline16n04*":[455],"[200":[456,462],"4.7GB]<br>":[458],"part3:":[459],"medline16n05*":[460],"medline16n06*":[461],"5.3GB]<br>":[464],"part4:":[465],"medline16n07*":[466],"medline16n08*":[467],"[113":[468],"3.1GB]":[470],"<strong>PheneBank_Processed_PMC.tar.gz</strong>":[472],"6,180":[475],"directories":[476],"after":[480],"journal":[482],"titles":[483],"drawn.":[490],"There":[491],"three":[493],"per":[495],"each":[496],"(i.e.,":[498],"3":[499],"3,751,770":[504],"distinct":[505],"articles),":[506],"containing":[507],"article:":[514],".title.txt,":[515],".abstract.txt,":[516],".body.txt.":[518],"line":[520,543],"starts":[521],"word;":[524],"those":[526],"words":[527],"identified":[530],"entities,":[532],"entity":[533],"type":[534],"mapping":[536,568],"information":[537],"followed":[539],"same":[542],"(tab":[544],"separated),":[545],"following":[548,585],"format:":[549],"word":[550],"&lt;TAB&gt;":[551,553,555],":::":[552],"entity_type":[554],"entity_concept_ID_1##confidence_score_1":[556],"entity_concept_ID_2##confidence_score_2":[557],"...":[558],"Note":[559],"sorted":[564],"according":[565],"confidence":[569],"scores.":[570],"<strong>PheneBank_Associations.tsv</strong>":[575],"file,":[576],"there":[577],"ten":[579],"columns":[580],"correspond":[582],"(left":[586],"right):":[588],"-":[589,592,596,599,603,606,609,612,615,617],"Disease":[590,593,607],"Name<br>":[591,598],"(MONDO)":[594],"ID<br>":[595,602],"(HPO)":[601],"Co-occurrence":[604],"Frequency<br>":[605,608,611],"(log)<br>":[614],"Dice<br>":[616],"Normalized":[618],"PMI":[619]},"counts_by_year":[],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2025-10-10T00:00:00"}
