{"id":"https://openalex.org/W2041439772","doi":"https://doi.org/10.1045/november14-smith-unna","title":"The ContentMine Scraping Stack: Literature-scale Content Mining with Community-maintained Collections of Declarative Scrapers","display_name":"The ContentMine Scraping Stack: Literature-scale Content Mining with Community-maintained Collections of Declarative Scrapers","publication_year":2014,"publication_date":"2014-11-01","ids":{"openalex":"https://openalex.org/W2041439772","doi":"https://doi.org/10.1045/november14-smith-unna","mag":"2041439772"},"language":"en","primary_location":{"id":"doi:10.1045/november14-smith-unna","is_oa":true,"landing_page_url":"https://doi.org/10.1045/november14-smith-unna","pdf_url":null,"source":{"id":"https://openalex.org/S119508283","display_name":"D-Lib Magazine","issn_l":"1082-9873","issn":["1082-9873"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310316316","host_organization_name":"Corporation for National Research Initiatives","host_organization_lineage":["https://openalex.org/P4310316316"],"host_organization_lineage_names":["Corporation for National Research Initiatives"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"D-Lib Magazine","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://doi.org/10.1045/november14-smith-unna","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5015200581","display_name":"Richard Smith-Unna","orcid":"https://orcid.org/0000-0001-8721-7197"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Richard Smith-Unna","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5041092827","display_name":"Peter Murray\u2010Rust","orcid":"https://orcid.org/0000-0003-3386-3972"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peter Murray-Rust","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5015200581"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.21286059,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":97},"biblio":{"volume":"20","issue":"11/12","first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12478","display_name":"Wikis in Education and Collaboration","score":0.9789000153541565,"subfield":{"id":"https://openalex.org/subfields/3315","display_name":"Communication"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T12478","display_name":"Wikis in Education and Collaboration","score":0.9789000153541565,"subfield":{"id":"https://openalex.org/subfields/3315","display_name":"Communication"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.9679999947547913,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.9484000205993652,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scraper-site","display_name":"Scraper site","score":0.8300998210906982},{"id":"https://openalex.org/keywords/stack","display_name":"Stack (abstract data type)","score":0.7255969643592834},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.5065244436264038},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.46956801414489746},{"id":"https://openalex.org/keywords/content","display_name":"Content (measure theory)","score":0.42819488048553467},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3228830397129059},{"id":"https://openalex.org/keywords/geography","display_name":"Geography","score":0.19977256655693054},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.1889352798461914},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.15356653928756714},{"id":"https://openalex.org/keywords/cartography","display_name":"Cartography","score":0.08163386583328247},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.07282724976539612}],"concepts":[{"id":"https://openalex.org/C78500136","wikidata":"https://www.wikidata.org/wiki/Q477840","display_name":"Scraper site","level":2,"score":0.8300998210906982},{"id":"https://openalex.org/C9395851","wikidata":"https://www.wikidata.org/wiki/Q177929","display_name":"Stack (abstract data type)","level":2,"score":0.7255969643592834},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.5065244436264038},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.46956801414489746},{"id":"https://openalex.org/C2778152352","wikidata":"https://www.wikidata.org/wiki/Q5165061","display_name":"Content (measure theory)","level":2,"score":0.42819488048553467},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3228830397129059},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.19977256655693054},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.1889352798461914},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.15356653928756714},{"id":"https://openalex.org/C58640448","wikidata":"https://www.wikidata.org/wiki/Q42515","display_name":"Cartography","level":1,"score":0.08163386583328247},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.07282724976539612},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1045/november14-smith-unna","is_oa":true,"landing_page_url":"https://doi.org/10.1045/november14-smith-unna","pdf_url":null,"source":{"id":"https://openalex.org/S119508283","display_name":"D-Lib Magazine","issn_l":"1082-9873","issn":["1082-9873"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310316316","host_organization_name":"Corporation for National Research Initiatives","host_organization_lineage":["https://openalex.org/P4310316316"],"host_organization_lineage_names":["Corporation for National Research Initiatives"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"D-Lib Magazine","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1045/november14-smith-unna","is_oa":true,"landing_page_url":"https://doi.org/10.1045/november14-smith-unna","pdf_url":null,"source":{"id":"https://openalex.org/S119508283","display_name":"D-Lib Magazine","issn_l":"1082-9873","issn":["1082-9873"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310316316","host_organization_name":"Corporation for National Research Initiatives","host_organization_lineage":["https://openalex.org/P4310316316"],"host_organization_lineage_names":["Corporation for National Research Initiatives"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"D-Lib Magazine","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2351413189","https://openalex.org/W2352031027","https://openalex.org/W2362264299","https://openalex.org/W2389718375","https://openalex.org/W2378574380","https://openalex.org/W2357222887","https://openalex.org/W2393563204","https://openalex.org/W2386629175","https://openalex.org/W2372195686","https://openalex.org/W2355008688"],"abstract_inverted_index":{"Successfully":[0],"mining":[1,34],"scholarly":[2,67],"literature":[3,68],"at":[4,35],"scale":[5],"is":[6],"inhibited":[7],"by":[8,19,82],"technical":[9],"and":[10,37,64,75],"political":[11],"barriers":[12],"that":[13,31],"have":[14,29],"been":[15],"only":[16,39],"partially":[17],"addressed":[18],"publishers'":[20],"application":[21],"programming":[22],"interfaces":[23,80],"(APIs).":[24],"Many":[25],"of":[26,70,73,97],"those":[27],"APIs":[28],"restrictions":[30],"inhibit":[32],"data":[33],"scale,":[36],"while":[38],"some":[40],"publishers":[41,47],"actually":[42],"provide":[43],"APIs,":[44],"almost":[45],"all":[46],"make":[48,59],"their":[49],"content":[50],"available":[51],"on":[52],"the":[53,66,71,88,98],"web.":[54],"Current":[55],"web":[56],"technologies":[57],"should":[58],"it":[60],"possible":[61],"to":[62,91],"harvest":[63],"mine":[65],"regardless":[69],"source":[72],"publication,":[74],"without":[76],"using":[77],"specialised":[78],"programmatic":[79],"controlled":[81],"each":[83],"publisher.":[84],"Here":[85],"we":[86],"describe":[87],"tools":[89],"developed":[90],"address":[92],"this":[93],"challenge":[94],"as":[95],"part":[96],"ContentMine":[99],"project.":[100]},"counts_by_year":[{"year":2021,"cited_by_count":1},{"year":2019,"cited_by_count":3},{"year":2018,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
