{"id":"https://openalex.org/W2198549397","doi":"https://doi.org/10.1109/bigdata.2015.7363933","title":"Scaling out for extreme scale corpus data","display_name":"Scaling out for extreme scale corpus data","publication_year":2015,"publication_date":"2015-10-01","ids":{"openalex":"https://openalex.org/W2198549397","doi":"https://doi.org/10.1109/bigdata.2015.7363933","mag":"2198549397"},"language":"en","primary_location":{"id":"doi:10.1109/bigdata.2015.7363933","is_oa":false,"landing_page_url":"https://doi.org/10.1109/bigdata.2015.7363933","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2015 IEEE International Conference on Big Data (Big Data)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://eprints.lancs.ac.uk/id/eprint/76744/4/extreme_scale_corpus.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5046274952","display_name":"Matthew Coole","orcid":"https://orcid.org/0000-0002-9279-931X"},"institutions":[{"id":"https://openalex.org/I67415387","display_name":"Lancaster University","ror":"https://ror.org/04f2nsd36","country_code":"GB","type":"education","lineage":["https://openalex.org/I67415387"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Matthew Coole","raw_affiliation_strings":["School of Computing and Communications, Lancaster University, Lancaster Lancashire, UK"],"affiliations":[{"raw_affiliation_string":"School of Computing and Communications, Lancaster University, Lancaster Lancashire, UK","institution_ids":["https://openalex.org/I67415387"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058785189","display_name":"Paul Rayson","orcid":"https://orcid.org/0000-0002-1257-2191"},"institutions":[{"id":"https://openalex.org/I67415387","display_name":"Lancaster University","ror":"https://ror.org/04f2nsd36","country_code":"GB","type":"education","lineage":["https://openalex.org/I67415387"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Paul Rayson","raw_affiliation_strings":["School of Computing and Communications, Lancaster University, Lancaster Lancashire, UK"],"affiliations":[{"raw_affiliation_string":"School of Computing and Communications, Lancaster University, Lancaster Lancashire, UK","institution_ids":["https://openalex.org/I67415387"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5039613496","display_name":"John Mariani","orcid":"https://orcid.org/0000-0003-0723-9594"},"institutions":[{"id":"https://openalex.org/I67415387","display_name":"Lancaster University","ror":"https://ror.org/04f2nsd36","country_code":"GB","type":"education","lineage":["https://openalex.org/I67415387"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"John Mariani","raw_affiliation_strings":["School of Computing and Communications, Lancaster University, Lancaster Lancashire, UK"],"affiliations":[{"raw_affiliation_string":"School of Computing and Communications, Lancaster University, Lancaster Lancashire, UK","institution_ids":["https://openalex.org/I67415387"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5046274952"],"corresponding_institution_ids":["https://openalex.org/I67415387"],"apc_list":null,"apc_paid":null,"fwci":0.822,"has_fulltext":true,"cited_by_count":5,"citation_normalized_percentile":{"value":0.8234581,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":97},"biblio":{"volume":"7352","issue":null,"first_page":"1643","last_page":"1649"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.995199978351593,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.995199978351593,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9941999912261963,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9940999746322632,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7952869534492493},{"id":"https://openalex.org/keywords/zipfs-law","display_name":"Zipf's law","score":0.6837608814239502},{"id":"https://openalex.org/keywords/burstiness","display_name":"Burstiness","score":0.5963475108146667},{"id":"https://openalex.org/keywords/big-data","display_name":"Big data","score":0.5826790928840637},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5248969793319702},{"id":"https://openalex.org/keywords/search-engine-indexing","display_name":"Search engine indexing","score":0.4575464129447937},{"id":"https://openalex.org/keywords/variety","display_name":"Variety (cybernetics)","score":0.44444021582603455},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.4312566816806793},{"id":"https://openalex.org/keywords/corpus-linguistics","display_name":"Corpus linguistics","score":0.4264853596687317},{"id":"https://openalex.org/keywords/text-corpus","display_name":"Text corpus","score":0.41107094287872314},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.40242376923561096},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.40226301550865173},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.39313381910324097},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.14287710189819336}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7952869534492493},{"id":"https://openalex.org/C125932096","wikidata":"https://www.wikidata.org/wiki/Q205472","display_name":"Zipf's law","level":2,"score":0.6837608814239502},{"id":"https://openalex.org/C2781023610","wikidata":"https://www.wikidata.org/wiki/Q17006304","display_name":"Burstiness","level":3,"score":0.5963475108146667},{"id":"https://openalex.org/C75684735","wikidata":"https://www.wikidata.org/wiki/Q858810","display_name":"Big data","level":2,"score":0.5826790928840637},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5248969793319702},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.4575464129447937},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.44444021582603455},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.4312566816806793},{"id":"https://openalex.org/C532629269","wikidata":"https://www.wikidata.org/wiki/Q865083","display_name":"Corpus linguistics","level":2,"score":0.4264853596687317},{"id":"https://openalex.org/C2474386","wikidata":"https://www.wikidata.org/wiki/Q461183","display_name":"Text corpus","level":2,"score":0.41107094287872314},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.40242376923561096},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.40226301550865173},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.39313381910324097},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.14287710189819336},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C158379750","wikidata":"https://www.wikidata.org/wiki/Q214111","display_name":"Network packet","level":2,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/bigdata.2015.7363933","is_oa":false,"landing_page_url":"https://doi.org/10.1109/bigdata.2015.7363933","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2015 IEEE International Conference on Big Data (Big Data)","raw_type":"proceedings-article"},{"id":"pmh:oai:eprints.lancs.ac.uk:76744","is_oa":true,"landing_page_url":null,"pdf_url":"https://eprints.lancs.ac.uk/id/eprint/76744/4/extreme_scale_corpus.pdf","source":{"id":"https://openalex.org/S4306401916","display_name":"Lancaster EPrints (Lancaster University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67415387","host_organization_name":"Lancaster University","host_organization_lineage":["https://openalex.org/I67415387"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"PeerReviewed"}],"best_oa_location":{"id":"pmh:oai:eprints.lancs.ac.uk:76744","is_oa":true,"landing_page_url":null,"pdf_url":"https://eprints.lancs.ac.uk/id/eprint/76744/4/extreme_scale_corpus.pdf","source":{"id":"https://openalex.org/S4306401916","display_name":"Lancaster EPrints (Lancaster University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67415387","host_organization_name":"Lancaster University","host_organization_lineage":["https://openalex.org/I67415387"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"PeerReviewed"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16","score":0.6600000262260437}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2198549397.pdf","grobid_xml":"https://content.openalex.org/works/W2198549397.grobid-xml"},"referenced_works_count":15,"referenced_works":["https://openalex.org/W1009222391","https://openalex.org/W1574901103","https://openalex.org/W1990508473","https://openalex.org/W2034639910","https://openalex.org/W2060859842","https://openalex.org/W2084568010","https://openalex.org/W2095503871","https://openalex.org/W2109718328","https://openalex.org/W2131243403","https://openalex.org/W2150907923","https://openalex.org/W2155510013","https://openalex.org/W2165131017","https://openalex.org/W2250489604","https://openalex.org/W4241560494","https://openalex.org/W6823338303"],"related_works":["https://openalex.org/W2124056838","https://openalex.org/W2375537499","https://openalex.org/W2516977220","https://openalex.org/W3096124370","https://openalex.org/W1982878818","https://openalex.org/W2794113965","https://openalex.org/W2384280299","https://openalex.org/W2366526038","https://openalex.org/W2183648197","https://openalex.org/W99386505"],"abstract_inverted_index":{"Much":[0],"of":[1,13,33,48,56,97,100,126,130,137,164,194,199,226,239,262],"the":[2,17,31,61,73,94,121,162,191,197,224,234,246],"previous":[3],"work":[4],"in":[5,20,63,67,128,157,167,207,230,249],"Big":[6,208],"Data":[7,209],"has":[8,105],"focussed":[9],"on":[10,60,120],"numerical":[11],"sources":[12],"information.":[14],"However,":[15],"with":[16,108,148,161,169,182],"`narrative":[18],"turn'":[19],"many":[21],"disciplines":[22],"gathering":[23],"pace":[24],"and":[25,66,79,124,145,178,196,222,237],"commercial":[26],"organisations":[27],"beginning":[28],"to":[29,88,153,220],"realise":[30],"value":[32],"their":[34],"textual":[35,58,90],"assets,":[36],"natural":[37],"language":[38,103,165,185,195],"data":[39,59,110,176,268],"is":[40,190,245],"fast":[41],"catching":[42],"up":[43],"as":[44,211],"an":[45],"exploitable":[46],"source":[47],"information":[49],"for":[50,111,134,175,233,270,278],"decision":[51],"making.":[52],"With":[53],"vast":[54],"quantities":[55],"unstructured":[57],"web,":[62],"social":[64],"media,":[65],"newly":[68],"digitised":[69],"historical":[70],"document":[71],"archives,":[72],"5Vs":[74],"(Volume,":[75],"Velocity,":[76],"Variety,":[77],"Value":[78],"Veracity)":[80],"apply":[81],"equally":[82],"well,":[83],"if":[84],"not":[85],"more":[86,279,283],"so,":[87],"big":[89,109],"data.":[91,186],"Corpus":[92,114],"linguistics,":[93],"computer-aided":[95],"study":[96],"large":[98],"collections":[99],"naturally":[101],"occurring":[102],"data,":[104],"been":[106],"dealing":[107,181],"fifty":[112],"years.":[113],"linguistics":[115],"methods":[116],"impose":[117],"complex":[118,173,284],"requirements":[119],"retrieval,":[122],"annotation":[123],"analysis":[125,238],"text":[127],"terms":[129,210],"displaying":[131],"narrow":[132],"contexts":[133],"each":[135],"occurrence":[136],"a":[138,212],"word":[139,200],"or":[140,151,202,282],"linguistic":[141],"feature":[142],"being":[143],"studied":[144],"counting":[146],"co-occurrences":[147],"other":[149],"words":[150,281],"features":[152,166],"determine":[154],"significant":[155],"patterns":[156],"language.":[158],"This,":[159],"coupled":[160],"distribution":[163],"accordance":[168],"Zipf's":[170],"Law,":[171],"poses":[172],"challenges":[174],"models":[177],"corpus":[179,250,267],"software":[180],"extreme":[183,265],"scale":[184,266],"A":[187],"related":[188],"issue":[189],"non-random":[192],"nature":[193],"`burstiness'":[198],"occurrences,":[201],"what":[203],"we":[204],"might":[205],"put":[206],"sixth":[213],"`V'":[214],"called":[215],"Viscosity.":[216],"We":[217,252],"report":[218],"experiments":[219],"examine":[221],"compare":[223],"capabilities":[225],"two":[227],"No-SQL":[228],"databases":[229],"clustered":[231],"configurations":[232],"indexing,":[235],"retrieval":[236],"billion-word":[240],"corpora,":[241],"since":[242],"this":[243,264],"size":[244],"current":[247],"state-of-the-art":[248],"linguistics.":[251],"find":[253],"that":[254],"modern":[255],"DBMSs":[256],"(Database":[257],"Management":[258],"Systems)":[259],"are":[260,274],"capable":[261],"handling":[263],"set":[269],"simple":[271],"queries":[272],"but":[273],"limited":[275],"when":[276],"querying":[277],"frequent":[280],"queries.":[285]},"counts_by_year":[{"year":2020,"cited_by_count":4},{"year":2016,"cited_by_count":1}],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2025-10-10T00:00:00"}
