{"id":"https://openalex.org/W2912067075","doi":"https://doi.org/10.1109/bigdata.2018.8622114","title":"CiteSeerX-2018: A Cleansed Multidisciplinary Scholarly Big Dataset","display_name":"CiteSeerX-2018: A Cleansed Multidisciplinary Scholarly Big Dataset","publication_year":2018,"publication_date":"2018-12-01","ids":{"openalex":"https://openalex.org/W2912067075","doi":"https://doi.org/10.1109/bigdata.2018.8622114","mag":"2912067075"},"language":"en","primary_location":{"id":"doi:10.1109/bigdata.2018.8622114","is_oa":false,"landing_page_url":"https://doi.org/10.1109/bigdata.2018.8622114","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2018 IEEE International Conference on Big Data (Big Data)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5075242841","display_name":"Jian Wu","orcid":"https://orcid.org/0000-0003-0173-4463"},"institutions":[{"id":"https://openalex.org/I81365321","display_name":"Old Dominion University","ror":"https://ror.org/04zjtrb98","country_code":"US","type":"education","lineage":["https://openalex.org/I81365321"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Jian Wu","raw_affiliation_strings":["Computer Science, Old Dominion University, Norfolk, VA, USA"],"affiliations":[{"raw_affiliation_string":"Computer Science, Old Dominion University, Norfolk, VA, USA","institution_ids":["https://openalex.org/I81365321"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062055966","display_name":"Bharath Kandimalla","orcid":null},"institutions":[{"id":"https://openalex.org/I130769515","display_name":"Pennsylvania State University","ror":"https://ror.org/04p491231","country_code":"US","type":"education","lineage":["https://openalex.org/I130769515"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Bharath Kandimalla","raw_affiliation_strings":["Information Sciences and Technology, Pennsylvania State University, University Park, PA, USA","[Information Sciences and Technology, Pennsylvania State University, University Park, PA, USA]"],"affiliations":[{"raw_affiliation_string":"Information Sciences and Technology, Pennsylvania State University, University Park, PA, USA","institution_ids":["https://openalex.org/I130769515"]},{"raw_affiliation_string":"[Information Sciences and Technology, Pennsylvania State University, University Park, PA, USA]","institution_ids":["https://openalex.org/I130769515"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045452927","display_name":"Shaurya Rohatgi","orcid":"https://orcid.org/0000-0001-7426-4576"},"institutions":[{"id":"https://openalex.org/I130769515","display_name":"Pennsylvania State University","ror":"https://ror.org/04p491231","country_code":"US","type":"education","lineage":["https://openalex.org/I130769515"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shaurya Rohatgi","raw_affiliation_strings":["Information Sciences and Technology, Pennsylvania State University, University Park, PA, USA","[Information Sciences and Technology, Pennsylvania State University, University Park, PA, USA]"],"affiliations":[{"raw_affiliation_string":"Information Sciences and Technology, Pennsylvania State University, University Park, PA, USA","institution_ids":["https://openalex.org/I130769515"]},{"raw_affiliation_string":"[Information Sciences and Technology, Pennsylvania State University, University Park, PA, USA]","institution_ids":["https://openalex.org/I130769515"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072607871","display_name":"Athar Sefid","orcid":null},"institutions":[{"id":"https://openalex.org/I130769515","display_name":"Pennsylvania State University","ror":"https://ror.org/04p491231","country_code":"US","type":"education","lineage":["https://openalex.org/I130769515"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Athar Sefid","raw_affiliation_strings":["Computer Science and Engineering, Pennsylvania State University, University Park, PA, USA"],"affiliations":[{"raw_affiliation_string":"Computer Science and Engineering, Pennsylvania State University, University Park, PA, USA","institution_ids":["https://openalex.org/I130769515"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005782759","display_name":"Jianyu Mao","orcid":"https://orcid.org/0000-0002-8752-9403"},"institutions":[{"id":"https://openalex.org/I130769515","display_name":"Pennsylvania State University","ror":"https://ror.org/04p491231","country_code":"US","type":"education","lineage":["https://openalex.org/I130769515"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jianyu Mao","raw_affiliation_strings":["Computer Science and Engineering, Pennsylvania State University, University Park, PA, USA"],"affiliations":[{"raw_affiliation_string":"Computer Science and Engineering, Pennsylvania State University, University Park, PA, USA","institution_ids":["https://openalex.org/I130769515"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5001294898","display_name":"C. Lee Giles","orcid":"https://orcid.org/0000-0002-1931-585X"},"institutions":[{"id":"https://openalex.org/I130769515","display_name":"Pennsylvania State University","ror":"https://ror.org/04p491231","country_code":"US","type":"education","lineage":["https://openalex.org/I130769515"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"C. Lee Giles","raw_affiliation_strings":["Information Sciences and Technology, Pennsylvania State University, University Park, PA, USA","[Information Sciences and Technology, Pennsylvania State University, University Park, PA, USA]"],"affiliations":[{"raw_affiliation_string":"Information Sciences and Technology, Pennsylvania State University, University Park, PA, USA","institution_ids":["https://openalex.org/I130769515"]},{"raw_affiliation_string":"[Information Sciences and Technology, Pennsylvania State University, University Park, PA, USA]","institution_ids":["https://openalex.org/I130769515"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5075242841"],"corresponding_institution_ids":["https://openalex.org/I81365321"],"apc_list":null,"apc_paid":null,"fwci":0.9773,"has_fulltext":false,"cited_by_count":7,"citation_normalized_percentile":{"value":0.8266175,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"5465","last_page":"5467"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.9793999791145325,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.9793999791145325,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9585000276565552,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9577000141143799,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/multidisciplinary-approach","display_name":"Multidisciplinary approach","score":0.8464003801345825},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5449210405349731},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.36608368158340454},{"id":"https://openalex.org/keywords/sociology","display_name":"Sociology","score":0.10932418704032898},{"id":"https://openalex.org/keywords/social-science","display_name":"Social science","score":0.0884065330028534}],"concepts":[{"id":"https://openalex.org/C22467394","wikidata":"https://www.wikidata.org/wiki/Q849359","display_name":"Multidisciplinary approach","level":2,"score":0.8464003801345825},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5449210405349731},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.36608368158340454},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.10932418704032898},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0884065330028534}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/bigdata.2018.8622114","is_oa":false,"landing_page_url":"https://doi.org/10.1109/bigdata.2018.8622114","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2018 IEEE International Conference on Big Data (Big Data)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":9,"referenced_works":["https://openalex.org/W2085030399","https://openalex.org/W2140479099","https://openalex.org/W2168065722","https://openalex.org/W2253675773","https://openalex.org/W2295508865","https://openalex.org/W2494261950","https://openalex.org/W2964694902","https://openalex.org/W4253723135","https://openalex.org/W4297751839"],"related_works":["https://openalex.org/W2748952813","https://openalex.org/W4286508873","https://openalex.org/W2185045523","https://openalex.org/W2175024588","https://openalex.org/W156969523","https://openalex.org/W184430638","https://openalex.org/W1982406023","https://openalex.org/W2007640890","https://openalex.org/W3031598356","https://openalex.org/W2729605695"],"abstract_inverted_index":{"We":[0,21,51],"report":[1],"the":[2,73],"preliminary":[3],"work":[4],"on":[5],"cleansing":[6],"and":[7,39,57,85],"classifying":[8],"a":[9],"scholarly":[10],"big":[11],"dataset":[12,76],"containing":[13,80],"10+":[14],"million":[15],"academic":[16],"documents":[17],"released":[18],"by":[19],"CiteSeerX.":[20],"design":[22],"novel":[23],"approaches":[24],"to":[25,31,61],"match":[26],"paper":[27],"entities":[28],"in":[29,42],"CiteSeerX":[30,75],"reference":[32],"datasets,":[33],"including":[34],"DBLP,":[35],"Web":[36],"of":[37],"Science,":[38],"Medline,":[40],"resulting":[41],"4.2M":[43],"unique":[44],"matches,":[45],"whose":[46],"metadata":[47],"can":[48],"be":[49],"cleansed.":[50],"also":[52],"investigate":[53],"traditional":[54],"machine":[55],"learning":[56],"neural":[58],"network":[59],"methods":[60],"classify":[62],"abstracts":[63],"into":[64],"6":[65],"subject":[66],"categories.":[67],"The":[68],"classification":[69],"results":[70],"reveal":[71],"that":[72],"current":[74],"is":[77],"highly":[78],"multidisciplinary,":[79],"papers":[81],"well":[82],"beyond":[83],"computer":[84],"information":[86],"sciences.":[87]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2021,"cited_by_count":4},{"year":2020,"cited_by_count":1},{"year":2019,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
