{"id":"https://openalex.org/W2982541428","doi":"https://doi.org/10.1109/nextcomp.2019.8883665","title":"Categorising AWS Common Crawl Dataset using MapReduce","display_name":"Categorising AWS Common Crawl Dataset using MapReduce","publication_year":2019,"publication_date":"2019-09-01","ids":{"openalex":"https://openalex.org/W2982541428","doi":"https://doi.org/10.1109/nextcomp.2019.8883665","mag":"2982541428"},"language":"en","primary_location":{"id":"doi:10.1109/nextcomp.2019.8883665","is_oa":false,"landing_page_url":"https://doi.org/10.1109/nextcomp.2019.8883665","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2019 Conference on Next Generation Computing Applications (NextComp)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5038291234","display_name":"Aatish Chiniah","orcid":"https://orcid.org/0000-0003-0045-2915"},"institutions":[{"id":"https://openalex.org/I69072986","display_name":"University of Mauritius","ror":"https://ror.org/05cyprz33","country_code":"MU","type":"education","lineage":["https://openalex.org/I69072986"]}],"countries":["MU"],"is_corresponding":true,"raw_author_name":"Aatish Chiniah","raw_affiliation_strings":["Department of Digital Technologies, University of Mauritius, Reduit, Mauritius"],"affiliations":[{"raw_affiliation_string":"Department of Digital Technologies, University of Mauritius, Reduit, Mauritius","institution_ids":["https://openalex.org/I69072986"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042805537","display_name":"Ayaz Chummun","orcid":null},"institutions":[{"id":"https://openalex.org/I69072986","display_name":"University of Mauritius","ror":"https://ror.org/05cyprz33","country_code":"MU","type":"education","lineage":["https://openalex.org/I69072986"]}],"countries":["MU"],"is_corresponding":false,"raw_author_name":"Ayaz Chummun","raw_affiliation_strings":["Department of Digital Technologies, University of Mauritius, Reduit, Mauritius"],"affiliations":[{"raw_affiliation_string":"Department of Digital Technologies, University of Mauritius, Reduit, Mauritius","institution_ids":["https://openalex.org/I69072986"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5069866070","display_name":"Zaid Burkutally","orcid":null},"institutions":[{"id":"https://openalex.org/I69072986","display_name":"University of Mauritius","ror":"https://ror.org/05cyprz33","country_code":"MU","type":"education","lineage":["https://openalex.org/I69072986"]}],"countries":["MU"],"is_corresponding":false,"raw_author_name":"Zaid Burkutally","raw_affiliation_strings":["Department of Digital Technologies, University of Mauritius, Reduit, Mauritius"],"affiliations":[{"raw_affiliation_string":"Department of Digital Technologies, University of Mauritius, Reduit, Mauritius","institution_ids":["https://openalex.org/I69072986"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5038291234"],"corresponding_institution_ids":["https://openalex.org/I69072986"],"apc_list":null,"apc_paid":null,"fwci":0.28,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.66476454,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":94},"biblio":{"volume":"1","issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11550","display_name":"Text and Document Classification Technologies","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11550","display_name":"Text and Document Classification Technologies","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11644","display_name":"Spam and Phishing Detection","score":0.9965999722480774,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8738422393798828},{"id":"https://openalex.org/keywords/upload","display_name":"Upload","score":0.761927604675293},{"id":"https://openalex.org/keywords/directory","display_name":"Directory","score":0.7432308197021484},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.6341196894645691},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.6114181876182556},{"id":"https://openalex.org/keywords/cloud-computing","display_name":"Cloud computing","score":0.5088217854499817},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.5051313042640686},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.48523449897766113},{"id":"https://openalex.org/keywords/resource","display_name":"Resource (disambiguation)","score":0.4221826195716858},{"id":"https://openalex.org/keywords/directory-service","display_name":"Directory service","score":0.4153668284416199},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.36240333318710327},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.1114371120929718}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8738422393798828},{"id":"https://openalex.org/C71901391","wikidata":"https://www.wikidata.org/wiki/Q7126699","display_name":"Upload","level":2,"score":0.761927604675293},{"id":"https://openalex.org/C2777683733","wikidata":"https://www.wikidata.org/wiki/Q201456","display_name":"Directory","level":2,"score":0.7432308197021484},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.6341196894645691},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.6114181876182556},{"id":"https://openalex.org/C79974875","wikidata":"https://www.wikidata.org/wiki/Q483639","display_name":"Cloud computing","level":2,"score":0.5088217854499817},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.5051313042640686},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.48523449897766113},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.4221826195716858},{"id":"https://openalex.org/C138338577","wikidata":"https://www.wikidata.org/wiki/Q756230","display_name":"Directory service","level":3,"score":0.4153668284416199},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.36240333318710327},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.1114371120929718},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/nextcomp.2019.8883665","is_oa":false,"landing_page_url":"https://doi.org/10.1109/nextcomp.2019.8883665","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2019 Conference on Next Generation Computing Applications (NextComp)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":6,"referenced_works":["https://openalex.org/W1504741607","https://openalex.org/W1999299681","https://openalex.org/W2053270317","https://openalex.org/W2401153089","https://openalex.org/W2771694204","https://openalex.org/W6712867312"],"related_works":["https://openalex.org/W181118223","https://openalex.org/W73423766","https://openalex.org/W4230960446","https://openalex.org/W2106753613","https://openalex.org/W2104608056","https://openalex.org/W2005960013","https://openalex.org/W2143307242","https://openalex.org/W2136877416","https://openalex.org/W2767128237","https://openalex.org/W2160164485"],"abstract_inverted_index":{"Keeping":[0],"track":[1],"of":[2,17,66,173,202,210],"websites":[3,40,48,75,226],"connected":[4],"to":[5,33,117,134,155,169],"the":[6,13,31,39,46,52,64,86,99,121,130,138,157,171,174,197],"Web":[7,54],"is":[8,127,146],"an":[9,217],"impossible":[10],"task":[11,32],"given":[12],"amplitude":[14],"and":[15,22,69,84,96,163,188,223],"fluctuation":[16],"new":[18,124,131],"sites":[19],"being":[20],"created":[21],"those":[23],"going":[24],"offline.":[25],"In":[26,167],"this":[27],"paper":[28],"we":[29],"took":[30],"create":[34],"a":[35,115,123,149],"directory":[36,164],"by":[37,159],"categorising":[38],"using":[41],"MapReduce.":[42,189],"The":[43,140],"dataset":[44,126],"about":[45,73,142],"different":[47],"are":[49,61,77,90,101,153,183],"collected":[50],"from":[51,120,216],"Amazon":[53],"Service":[55],"(AWS)":[56],"Common":[57],"Crawl":[58],"dataset.":[59],"Datasets":[60],"released":[62],"at":[63],"end":[65],"each":[67,88,118,143],"month":[68],"it":[70],"contains":[71],"information":[72],"all":[74],"that":[76,80,194],"live":[78],"during":[79],"month.":[81],"After":[82,112],"mining":[83],"translating":[85],"dataset,":[87,122],"URL":[89,119,132,144],"categorised":[91,125,222],"based":[92,165],"on":[93,137,207],"its":[94],"keywords":[95],"hosted":[97],"country,":[98],"URLs":[100,215],"then":[102],"placed":[103],"in":[104,200],"categories":[105,162],"such":[106],"as":[107],"Art,":[108],"Education,":[109],"Shopping":[110],"etc...":[111],"successfully":[113,221,228],"assigning":[114],"category":[116],"updated":[128],"with":[129],"objects":[133],"be":[135],"uploaded":[136],"cloud.":[139],"data":[141,158],"object":[145],"presented":[147],"through":[148],"website":[150],"whereby":[151],"users":[152],"allowed":[154],"browse":[156],"both":[160],"their":[161],"domains.":[166],"order":[168],"evaluate":[170],"performance":[172],"categorisation":[175],"process,":[176],"three":[177],"techniques":[178],"have":[179],"been":[180,192],"used":[181],"which":[182],"Java":[184],"8":[185],"Streams,":[186],"Multi-Threading":[187],"It":[190],"has":[191],"observed":[193],"MapReduce":[195],"was":[196],"most":[198],"performing":[199],"terms":[201],"execution":[203],"time,":[204],"resource":[205],"usage":[206],"Hadoop":[208],"Cluster":[209],"seven":[211],"nodes.":[212],"23576":[213],"(90.3%)":[214],"AWS":[218],"datasets":[219],"were":[220,227],"5767(22.8%)":[224],"non-English":[225],"translated.":[229]},"counts_by_year":[{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
