{"id":"https://openalex.org/W4312491933","doi":"https://doi.org/10.1145/3558100.3563850","title":"Scholarly big data quality assessment","display_name":"Scholarly big data quality assessment","publication_year":2022,"publication_date":"2022-09-20","ids":{"openalex":"https://openalex.org/W4312491933","doi":"https://doi.org/10.1145/3558100.3563850"},"language":"en","primary_location":{"id":"doi:10.1145/3558100.3563850","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3558100.3563850","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3558100.3563850","source":null,"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 22nd ACM Symposium on Document Engineering","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3558100.3563850","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5075242841","display_name":"Jian Wu","orcid":"https://orcid.org/0000-0003-0173-4463"},"institutions":[{"id":"https://openalex.org/I4210092569","display_name":"Dominion University College","ror":"https://ror.org/003kqe171","country_code":"GH","type":"education","lineage":["https://openalex.org/I4210092569"]},{"id":"https://openalex.org/I81365321","display_name":"Old Dominion University","ror":"https://ror.org/04zjtrb98","country_code":"US","type":"education","lineage":["https://openalex.org/I81365321"]}],"countries":["GH","US"],"is_corresponding":true,"raw_author_name":"Jian Wu","raw_affiliation_strings":["Old Dominion University"],"affiliations":[{"raw_affiliation_string":"Old Dominion University","institution_ids":["https://openalex.org/I4210092569","https://openalex.org/I81365321"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088754329","display_name":"Ryan Hiltabrand","orcid":null},"institutions":[{"id":"https://openalex.org/I81365321","display_name":"Old Dominion University","ror":"https://ror.org/04zjtrb98","country_code":"US","type":"education","lineage":["https://openalex.org/I81365321"]},{"id":"https://openalex.org/I4210092569","display_name":"Dominion University College","ror":"https://ror.org/003kqe171","country_code":"GH","type":"education","lineage":["https://openalex.org/I4210092569"]}],"countries":["GH","US"],"is_corresponding":false,"raw_author_name":"Ryan Hiltabrand","raw_affiliation_strings":["Old Dominion University"],"affiliations":[{"raw_affiliation_string":"Old Dominion University","institution_ids":["https://openalex.org/I4210092569","https://openalex.org/I81365321"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015568562","display_name":"Dominik So\u00f3s","orcid":"https://orcid.org/0000-0002-7089-6354"},"institutions":[{"id":"https://openalex.org/I81365321","display_name":"Old Dominion University","ror":"https://ror.org/04zjtrb98","country_code":"US","type":"education","lineage":["https://openalex.org/I81365321"]},{"id":"https://openalex.org/I4210092569","display_name":"Dominion University College","ror":"https://ror.org/003kqe171","country_code":"GH","type":"education","lineage":["https://openalex.org/I4210092569"]}],"countries":["GH","US"],"is_corresponding":false,"raw_author_name":"Dominik So\u00f3s","raw_affiliation_strings":["Old Dominion University"],"affiliations":[{"raw_affiliation_string":"Old Dominion University","institution_ids":["https://openalex.org/I4210092569","https://openalex.org/I81365321"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5001294898","display_name":"C. Lee Giles","orcid":"https://orcid.org/0000-0002-1931-585X"},"institutions":[{"id":"https://openalex.org/I130769515","display_name":"Pennsylvania State University","ror":"https://ror.org/04p491231","country_code":"US","type":"education","lineage":["https://openalex.org/I130769515"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"C. Lee Giles","raw_affiliation_strings":["Pennsylvania State University"],"affiliations":[{"raw_affiliation_string":"Pennsylvania State University","institution_ids":["https://openalex.org/I130769515"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5075242841"],"corresponding_institution_ids":["https://openalex.org/I4210092569","https://openalex.org/I81365321"],"apc_list":null,"apc_paid":null,"fwci":0.4164,"has_fulltext":true,"cited_by_count":3,"citation_normalized_percentile":{"value":0.69401184,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"4"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9976000189781189,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.982200026512146,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/conflation","display_name":"Conflation","score":0.873031497001648},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8524994254112244},{"id":"https://openalex.org/keywords/metadata","display_name":"Metadata","score":0.8203141689300537},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.6709237098693848},{"id":"https://openalex.org/keywords/ground-truth","display_name":"Ground truth","score":0.5754390954971313},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.4951578676700592},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.48971298336982727},{"id":"https://openalex.org/keywords/hash-function","display_name":"Hash function","score":0.4735794961452484},{"id":"https://openalex.org/keywords/big-data","display_name":"Big data","score":0.46331334114074707},{"id":"https://openalex.org/keywords/citation","display_name":"Citation","score":0.45515745878219604},{"id":"https://openalex.org/keywords/metadata-repository","display_name":"Metadata repository","score":0.43802985548973083},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.3683356046676636},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.3368666470050812},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.28393298387527466},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.24839279055595398},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.2356186807155609}],"concepts":[{"id":"https://openalex.org/C130440534","wikidata":"https://www.wikidata.org/wiki/Q14946528","display_name":"Conflation","level":2,"score":0.873031497001648},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8524994254112244},{"id":"https://openalex.org/C93518851","wikidata":"https://www.wikidata.org/wiki/Q180160","display_name":"Metadata","level":2,"score":0.8203141689300537},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.6709237098693848},{"id":"https://openalex.org/C146849305","wikidata":"https://www.wikidata.org/wiki/Q370766","display_name":"Ground truth","level":2,"score":0.5754390954971313},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.4951578676700592},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.48971298336982727},{"id":"https://openalex.org/C99138194","wikidata":"https://www.wikidata.org/wiki/Q183427","display_name":"Hash function","level":2,"score":0.4735794961452484},{"id":"https://openalex.org/C75684735","wikidata":"https://www.wikidata.org/wiki/Q858810","display_name":"Big data","level":2,"score":0.46331334114074707},{"id":"https://openalex.org/C2778805511","wikidata":"https://www.wikidata.org/wiki/Q1713","display_name":"Citation","level":2,"score":0.45515745878219604},{"id":"https://openalex.org/C153048206","wikidata":"https://www.wikidata.org/wiki/Q3454922","display_name":"Metadata repository","level":3,"score":0.43802985548973083},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.3683356046676636},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3368666470050812},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.28393298387527466},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.24839279055595398},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2356186807155609},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3558100.3563850","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3558100.3563850","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3558100.3563850","source":null,"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 22nd ACM Symposium on Document Engineering","raw_type":"proceedings-article"},{"id":"pmh:oai:digitalcommons.odu.edu:computerscience_fac_pubs-1230","is_oa":true,"landing_page_url":"https://digitalcommons.odu.edu/computerscience_fac_pubs/226","pdf_url":null,"source":{"id":"https://openalex.org/S4377196314","display_name":"ODU Digital Commons (Old Dominion University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I81365321","host_organization_name":"Old Dominion University","host_organization_lineage":["https://openalex.org/I81365321"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Computer Science Faculty Publications","raw_type":"conference"}],"best_oa_location":{"id":"doi:10.1145/3558100.3563850","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3558100.3563850","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3558100.3563850","source":null,"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 22nd ACM Symposium on Document Engineering","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G3911204809","display_name":null,"funder_award_id":"1823288","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G848032724","display_name":null,"funder_award_id":"Science","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4312491933.pdf","grobid_xml":"https://content.openalex.org/works/W4312491933.grobid-xml"},"referenced_works_count":16,"referenced_works":["https://openalex.org/W1603719052","https://openalex.org/W1736726159","https://openalex.org/W2038319167","https://openalex.org/W2046977065","https://openalex.org/W2093692255","https://openalex.org/W2168190036","https://openalex.org/W2213054775","https://openalex.org/W2570427954","https://openalex.org/W2789337509","https://openalex.org/W2945941062","https://openalex.org/W3002924435","https://openalex.org/W3015453090","https://openalex.org/W3099977667","https://openalex.org/W3102611879","https://openalex.org/W3120655260","https://openalex.org/W6674169945"],"related_works":["https://openalex.org/W1552553528","https://openalex.org/W2183628870","https://openalex.org/W2782431616","https://openalex.org/W3023161639","https://openalex.org/W2008531296","https://openalex.org/W2379265733","https://openalex.org/W2394393789","https://openalex.org/W2374379029","https://openalex.org/W1503116306","https://openalex.org/W4299935056"],"abstract_inverted_index":{"Recently,":[0],"the":[1,8,17,61,67,72,84,102,134,147],"Allen":[2],"Institute":[3],"for":[4,71],"Artificial":[5],"Intelligence":[6],"released":[7],"Semantic":[9],"Scholar":[10],"Open":[11],"Research":[12],"Corpus":[13],"(S2ORC),":[14],"one":[15],"of":[16,36,93,121,152],"largest":[18],"open-access":[19],"scholarly":[20,28],"big":[21],"datasets":[22],"with":[23,91],"more":[24],"than":[25],"130":[26],"million":[27],"paper":[29],"records.":[30],"S2ORC":[31,73],"contains":[32],"a":[33,161],"significant":[34],"portion":[35],"automatically":[37],"generated":[38],"metadata.":[39],"The":[40,110,140],"metadata":[41],"quality":[42,64,88,104],"could":[43],"impact":[44],"downstream":[45],"tasks":[46],"such":[47],"as":[48],"citation":[49,51],"analysis,":[50],"prediction,":[52],"and":[53,65,154,160,167],"link":[54],"analysis.":[55],"In":[56],"this":[57],"project,":[58],"we":[59,81],"assess":[60],"document":[62,68,86,111],"linking":[63,87,96,103],"estimate":[66],"conflation":[69,112],"rate":[70,113],"dataset.":[74],"Using":[75],"semi-automatically":[76],"curated":[77],"ground":[78,135],"truth":[79,136],"corpora,":[80],"estimated":[82],"that":[83,118,143],"overall":[85],"is":[89,114],"high,":[90],"92.6%":[92],"documents":[94,122],"correctly":[95],"to":[97],"six":[98],"major":[99],"databases,":[100],"but":[101],"varies":[105],"depending":[106],"on":[107],"subject":[108],"domains.":[109],"around":[115],"2.6%,":[116],"meaning":[117],"about":[119],"97.4%":[120],"are":[123,169],"unique.":[124],"We":[125],"further":[126],"quantitatively":[127],"compared":[128],"three":[129],"near-duplicate":[130],"detection":[131],"methods":[132],"using":[133],"created":[137],"from":[138],"S2ORC.":[139],"experiments":[141],"indicated":[142],"locality-sensitive":[144],"hashing":[145],"was":[146],"best":[148],"method":[149],"in":[150],"terms":[151],"effectiveness":[153],"scalability,":[155],"achieving":[156],"high":[157],"performance":[158],"(F1=0.960)":[159],"much":[162],"reduced":[163],"runtime.":[164],"Our":[165],"code":[166],"data":[168],"available":[170],"at":[171],"https://github.com/lamps-lab/docconflation.":[172]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2023,"cited_by_count":1}],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-10-10T00:00:00"}
