{"id":"https://openalex.org/W4400522920","doi":"https://doi.org/10.1162/dint_a_00255","title":"FAIR Enough: Develop and Assess a FAIR-Compliant Dataset for Large Language Model Training?","display_name":"FAIR Enough: Develop and Assess a FAIR-Compliant Dataset for Large Language Model Training?","publication_year":2024,"publication_date":"2024-01-01","ids":{"openalex":"https://openalex.org/W4400522920","doi":"https://doi.org/10.1162/dint_a_00255"},"language":"en","primary_location":{"id":"doi:10.1162/dint_a_00255","is_oa":true,"landing_page_url":"https://doi.org/10.1162/dint_a_00255","pdf_url":"https://direct.mit.edu/dint/article-pdf/6/2/559/2458950/dint_a_00255.pdf","source":{"id":"https://openalex.org/S4210186383","display_name":"Data Intelligence","issn_l":"2096-7004","issn":["2096-7004","2641-435X"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310315718","host_organization_name":"The MIT Press","host_organization_lineage":["https://openalex.org/P4310315718"],"host_organization_lineage_names":["The MIT Press"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Data Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://direct.mit.edu/dint/article-pdf/6/2/559/2458950/dint_a_00255.pdf","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5053939840","display_name":"Shaina Raza","orcid":"https://orcid.org/0000-0003-1061-5845"},"institutions":[{"id":"https://openalex.org/I4210127509","display_name":"Vector Institute","ror":"https://ror.org/03kqdja62","country_code":"CA","type":"facility","lineage":["https://openalex.org/I4210127509"]}],"countries":["CA"],"is_corresponding":true,"raw_author_name":"Shaina Raza","raw_affiliation_strings":["Vector Institute for Artificial Intelligence, Toronto, Ontario, Canada"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Vector Institute for Artificial Intelligence, Toronto, Ontario, Canada","institution_ids":["https://openalex.org/I4210127509"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5092998387","display_name":"Shardul Ghuge","orcid":null},"institutions":[{"id":"https://openalex.org/I4210127509","display_name":"Vector Institute","ror":"https://ror.org/03kqdja62","country_code":"CA","type":"facility","lineage":["https://openalex.org/I4210127509"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Shardul Ghuge","raw_affiliation_strings":["Vector Institute for Artificial Intelligence, Toronto, Ontario, Canada"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Vector Institute for Artificial Intelligence, Toronto, Ontario, Canada","institution_ids":["https://openalex.org/I4210127509"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5105841630","display_name":"Chen Ding","orcid":"https://orcid.org/0000-0003-0013-3439"},"institutions":[{"id":"https://openalex.org/I530967","display_name":"Toronto Metropolitan University","ror":"https://ror.org/05g13zd79","country_code":"CA","type":"education","lineage":["https://openalex.org/I530967"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Chen Ding","raw_affiliation_strings":["Toronto Metropolitan University, Toronto, Ontario, Canada"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Toronto Metropolitan University, Toronto, Ontario, Canada","institution_ids":["https://openalex.org/I530967"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5061175035","display_name":"Elham Dolatabadi","orcid":"https://orcid.org/0000-0003-2236-2611"},"institutions":[{"id":"https://openalex.org/I192455969","display_name":"York University","ror":"https://ror.org/05fq50484","country_code":"CA","type":"education","lineage":["https://openalex.org/I192455969"]},{"id":"https://openalex.org/I4210127509","display_name":"Vector Institute","ror":"https://ror.org/03kqdja62","country_code":"CA","type":"facility","lineage":["https://openalex.org/I4210127509"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Elham Dolatabadi","raw_affiliation_strings":["Vector Institute for Artificial Intelligence, Toronto, Ontario, Canada","York University, Toronto, Ontario, Canada"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Vector Institute for Artificial Intelligence, Toronto, Ontario, Canada","institution_ids":["https://openalex.org/I4210127509"]},{"raw_affiliation_string":"York University, Toronto, Ontario, Canada","institution_ids":["https://openalex.org/I192455969"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5108445535","display_name":"Deval Pandya","orcid":null},"institutions":[{"id":"https://openalex.org/I4210127509","display_name":"Vector Institute","ror":"https://ror.org/03kqdja62","country_code":"CA","type":"facility","lineage":["https://openalex.org/I4210127509"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Deval Pandya","raw_affiliation_strings":["Vector Institute for Artificial Intelligence, Toronto, Ontario, Canada"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Vector Institute for Artificial Intelligence, Toronto, Ontario, Canada","institution_ids":["https://openalex.org/I4210127509"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5053939840"],"corresponding_institution_ids":["https://openalex.org/I4210127509"],"apc_list":null,"apc_paid":null,"fwci":13.2004,"has_fulltext":true,"cited_by_count":18,"citation_normalized_percentile":{"value":0.98676676,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":"6","issue":"2","first_page":"559","last_page":"585"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11937","display_name":"Research Data Management Practices","score":0.9848999977111816,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11937","display_name":"Research Data Management Practices","score":0.9848999977111816,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9829000234603882,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10764","display_name":"Privacy-Preserving Technologies in Data","score":0.9786999821662903,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/stewardship","display_name":"Stewardship (theology)","score":0.6692160367965698},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5721651315689087},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.552646279335022},{"id":"https://openalex.org/keywords/interoperability","display_name":"Interoperability","score":0.5367274284362793},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.5130171179771423},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.506138265132904},{"id":"https://openalex.org/keywords/work","display_name":"Work (physics)","score":0.47797390818595886},{"id":"https://openalex.org/keywords/engineering-ethics","display_name":"Engineering ethics","score":0.4711557626724243},{"id":"https://openalex.org/keywords/knowledge-management","display_name":"Knowledge management","score":0.4621601104736328},{"id":"https://openalex.org/keywords/process-management","display_name":"Process management","score":0.44318100810050964},{"id":"https://openalex.org/keywords/accreditation","display_name":"Accreditation","score":0.43182775378227234},{"id":"https://openalex.org/keywords/checklist","display_name":"Checklist","score":0.4283354580402374},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.4264277219772339},{"id":"https://openalex.org/keywords/risk-analysis","display_name":"Risk analysis (engineering)","score":0.32276880741119385},{"id":"https://openalex.org/keywords/political-science","display_name":"Political science","score":0.18662360310554504},{"id":"https://openalex.org/keywords/business","display_name":"Business","score":0.1773104965686798},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.14670217037200928},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.14626401662826538},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.11743885278701782},{"id":"https://openalex.org/keywords/medical-education","display_name":"Medical education","score":0.10814180970191956},{"id":"https://openalex.org/keywords/medicine","display_name":"Medicine","score":0.1001412570476532}],"concepts":[{"id":"https://openalex.org/C2777950569","wikidata":"https://www.wikidata.org/wiki/Q17021836","display_name":"Stewardship (theology)","level":3,"score":0.6692160367965698},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5721651315689087},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.552646279335022},{"id":"https://openalex.org/C20136886","wikidata":"https://www.wikidata.org/wiki/Q749647","display_name":"Interoperability","level":2,"score":0.5367274284362793},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.5130171179771423},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.506138265132904},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.47797390818595886},{"id":"https://openalex.org/C55587333","wikidata":"https://www.wikidata.org/wiki/Q1133029","display_name":"Engineering ethics","level":1,"score":0.4711557626724243},{"id":"https://openalex.org/C56739046","wikidata":"https://www.wikidata.org/wiki/Q192060","display_name":"Knowledge management","level":1,"score":0.4621601104736328},{"id":"https://openalex.org/C195094911","wikidata":"https://www.wikidata.org/wiki/Q14167904","display_name":"Process management","level":1,"score":0.44318100810050964},{"id":"https://openalex.org/C61521584","wikidata":"https://www.wikidata.org/wiki/Q705899","display_name":"Accreditation","level":2,"score":0.43182775378227234},{"id":"https://openalex.org/C2779356329","wikidata":"https://www.wikidata.org/wiki/Q922625","display_name":"Checklist","level":2,"score":0.4283354580402374},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.4264277219772339},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.32276880741119385},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.18662360310554504},{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.1773104965686798},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.14670217037200928},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.14626401662826538},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.11743885278701782},{"id":"https://openalex.org/C509550671","wikidata":"https://www.wikidata.org/wiki/Q126945","display_name":"Medical education","level":1,"score":0.10814180970191956},{"id":"https://openalex.org/C71924100","wikidata":"https://www.wikidata.org/wiki/Q11190","display_name":"Medicine","level":0,"score":0.1001412570476532},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C78519656","wikidata":"https://www.wikidata.org/wiki/Q101333","display_name":"Mechanical engineering","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1162/dint_a_00255","is_oa":true,"landing_page_url":"https://doi.org/10.1162/dint_a_00255","pdf_url":"https://direct.mit.edu/dint/article-pdf/6/2/559/2458950/dint_a_00255.pdf","source":{"id":"https://openalex.org/S4210186383","display_name":"Data Intelligence","issn_l":"2096-7004","issn":["2096-7004","2641-435X"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310315718","host_organization_name":"The MIT Press","host_organization_lineage":["https://openalex.org/P4310315718"],"host_organization_lineage_names":["The MIT Press"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Data Intelligence","raw_type":"journal-article"},{"id":"pmh:oai:doaj.org/article:32a30618e59d4dc9b7125fc4c55a5246","is_oa":false,"landing_page_url":"https://doaj.org/article/32a30618e59d4dc9b7125fc4c55a5246","pdf_url":null,"source":{"id":"https://openalex.org/S4306401280","display_name":"DOAJ (DOAJ: Directory of Open Access Journals)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Data Intelligence, Vol 6, Iss 2 (2024)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.1162/dint_a_00255","is_oa":true,"landing_page_url":"https://doi.org/10.1162/dint_a_00255","pdf_url":"https://direct.mit.edu/dint/article-pdf/6/2/559/2458950/dint_a_00255.pdf","source":{"id":"https://openalex.org/S4210186383","display_name":"Data Intelligence","issn_l":"2096-7004","issn":["2096-7004","2641-435X"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310315718","host_organization_name":"The MIT Press","host_organization_lineage":["https://openalex.org/P4310315718"],"host_organization_lineage_names":["The MIT Press"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Data Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320309949","display_name":"Canadian Institute for Advanced Research","ror":"https://ror.org/01sdtdd95"},{"id":"https://openalex.org/F4320319880","display_name":"Government of Canada","ror":"https://ror.org/010q4q527"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4400522920.pdf","grobid_xml":"https://content.openalex.org/works/W4400522920.grobid-xml"},"referenced_works_count":73,"referenced_works":["https://openalex.org/W2095899184","https://openalex.org/W2302501749","https://openalex.org/W2718669472","https://openalex.org/W2783839275","https://openalex.org/W2792488055","https://openalex.org/W2804142383","https://openalex.org/W2896558853","https://openalex.org/W2911501529","https://openalex.org/W2912011423","https://openalex.org/W2952037138","https://openalex.org/W2953522645","https://openalex.org/W2954907832","https://openalex.org/W2963078909","https://openalex.org/W2986579771","https://openalex.org/W2988093192","https://openalex.org/W2988998130","https://openalex.org/W3004493409","https://openalex.org/W3044438666","https://openalex.org/W3110795479","https://openalex.org/W3133702157","https://openalex.org/W3171896931","https://openalex.org/W3173162544","https://openalex.org/W3177468621","https://openalex.org/W3203110335","https://openalex.org/W3203149535","https://openalex.org/W3206611000","https://openalex.org/W4210299703","https://openalex.org/W4213014074","https://openalex.org/W4232784937","https://openalex.org/W4286850188","https://openalex.org/W4288058238","https://openalex.org/W4309674289","https://openalex.org/W4310238579","https://openalex.org/W4312611720","https://openalex.org/W4318473069","https://openalex.org/W4320351055","https://openalex.org/W4323043727","https://openalex.org/W4362515116","https://openalex.org/W4365804479","https://openalex.org/W4366816122","https://openalex.org/W4378472501","https://openalex.org/W4381891405","https://openalex.org/W4382631260","https://openalex.org/W4384560559","https://openalex.org/W4384662964","https://openalex.org/W4384918448","https://openalex.org/W4385190659","https://openalex.org/W4385262268","https://openalex.org/W4385276403","https://openalex.org/W4386081001","https://openalex.org/W4386231337","https://openalex.org/W4386783354","https://openalex.org/W4386794445","https://openalex.org/W4386836871","https://openalex.org/W4388521769","https://openalex.org/W4388778142","https://openalex.org/W4389165057","https://openalex.org/W4389279010","https://openalex.org/W4390048758","https://openalex.org/W4390490761","https://openalex.org/W4391136507","https://openalex.org/W6753969151","https://openalex.org/W6755300836","https://openalex.org/W6758352718","https://openalex.org/W6760605923","https://openalex.org/W6773323267","https://openalex.org/W6776644801","https://openalex.org/W6780994845","https://openalex.org/W6801913153","https://openalex.org/W6808989385","https://openalex.org/W6850440376","https://openalex.org/W6851792948","https://openalex.org/W7047884249"],"related_works":["https://openalex.org/W4234875088","https://openalex.org/W2007960770","https://openalex.org/W2476517534","https://openalex.org/W2013796470","https://openalex.org/W2183187020","https://openalex.org/W2371811190","https://openalex.org/W2509459056","https://openalex.org/W1971595880","https://openalex.org/W4385407997","https://openalex.org/W2252873749"],"abstract_inverted_index":{"ABSTRACT":[0],"The":[1,133],"rapid":[2],"evolution":[3],"of":[4,25,48,62,70,77,106,112,137,173],"Large":[5],"Language":[6],"Models":[7],"(LLMs)":[8],"highlights":[9],"the":[10,23,46,60,75,100,110,129,164,171],"necessity":[11],"for":[12,38,83],"ethical":[13,39],"considerations":[14],"and":[15,120,135,154,178],"data":[16,31,40,51,82,125],"integrity":[17],"in":[18,45,80,122,157],"AI":[19,181],"development,":[20],"particularly":[21],"emphasizing":[22],"role":[24],"FAIR":[26,78,97,124],"(Findable,":[27],"Accessible,":[28],"Interoperable,":[29],"Reusable)":[30],"principles.":[32],"While":[33],"these":[34],"principles":[35,79,98,126],"are":[36,140],"crucial":[37],"stewardship,":[41],"their":[42],"specific":[43],"application":[44],"context":[47],"LLM":[49,84,101],"training":[50],"remains":[52],"an":[53,68],"under-explored":[54],"area.":[55],"This":[56],"research":[57],"gap":[58],"is":[59,109],"focus":[61],"our":[63,107,138],"study,":[64],"which":[65],"begins":[66],"with":[67],"examination":[69],"existing":[71],"literature":[72],"to":[73,95,117,163,169],"underline":[74],"importance":[76],"managing":[81],"training.":[85],"Building":[86],"upon":[87],"this,":[88],"we":[89],"propose":[90],"a":[91,113,143,148,167],"novel":[92],"frame-work":[93,139],"designed":[94],"integrate":[96],"into":[99],"development":[102,111,131],"lifecycle.":[103],"A":[104],"contribution":[105],"work":[108],"comprehensive":[114],"checklist":[115],"intended":[116],"guide":[118],"researchers":[119],"developers":[121],"applying":[123],"consistently":[127],"across":[128],"model":[130],"process.":[132],"utility":[134],"effectiveness":[136],"validated":[141],"through":[142],"case":[144],"study":[145],"on":[146],"creating":[147],"FAIR-compliant":[149],"dataset":[150],"aimed":[151],"at":[152],"detecting":[153],"mitigating":[155],"biases":[156],"LLMs.":[158],"We":[159],"present":[160],"this":[161],"framework":[162],"community":[165],"as":[166],"tool":[168],"foster":[170],"creation":[172],"technologically":[174],"advanced,":[175],"ethically":[176],"grounded,":[177],"socially":[179],"responsible":[180],"models.":[182]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":12},{"year":2024,"cited_by_count":3}],"updated_date":"2026-05-06T08:25:59.206177","created_date":"2025-10-10T00:00:00"}
