{"id":"https://openalex.org/W1984868580","doi":"https://doi.org/10.1145/2435215.2435218","title":"A Comprehensive Study of Techniques for URL-Based Web Page Language Classification","display_name":"A Comprehensive Study of Techniques for URL-Based Web Page Language Classification","publication_year":2013,"publication_date":"2013-03-01","ids":{"openalex":"https://openalex.org/W1984868580","doi":"https://doi.org/10.1145/2435215.2435218","mag":"1984868580"},"language":"en","primary_location":{"id":"doi:10.1145/2435215.2435218","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2435215.2435218","pdf_url":null,"source":{"id":"https://openalex.org/S131231701","display_name":"ACM Transactions on the Web","issn_l":"1559-1131","issn":["1559-1131","1559-114X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on the Web","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5048104591","display_name":"Eda Baykan","orcid":null},"institutions":[{"id":"https://openalex.org/I132257509","display_name":"Izmir University","ror":"https://ror.org/013h3xr51","country_code":"TR","type":"education","lineage":["https://openalex.org/I132257509"]}],"countries":["TR"],"is_corresponding":true,"raw_author_name":"Eda Baykan","raw_affiliation_strings":["Izmir University","Izmir University#TAB#"],"affiliations":[{"raw_affiliation_string":"Izmir University","institution_ids":["https://openalex.org/I132257509"]},{"raw_affiliation_string":"Izmir University#TAB#","institution_ids":["https://openalex.org/I132257509"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003641345","display_name":"Monika Henzinger","orcid":"https://orcid.org/0000-0002-5008-6530"},"institutions":[{"id":"https://openalex.org/I129774422","display_name":"University of Vienna","ror":"https://ror.org/03prydq77","country_code":"AT","type":"education","lineage":["https://openalex.org/I129774422"]}],"countries":["AT"],"is_corresponding":false,"raw_author_name":"Monika Henzinger","raw_affiliation_strings":["University of Vienna","Univ. of Vienna#TAB#"],"affiliations":[{"raw_affiliation_string":"University of Vienna","institution_ids":["https://openalex.org/I129774422"]},{"raw_affiliation_string":"Univ. of Vienna#TAB#","institution_ids":["https://openalex.org/I129774422"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5033656008","display_name":"Ingmar Weber","orcid":"https://orcid.org/0000-0003-4169-2579"},"institutions":[{"id":"https://openalex.org/I2800095910","display_name":"Yahoo (Spain)","ror":"https://ror.org/03gq8sg42","country_code":"ES","type":"company","lineage":["https://openalex.org/I2800095910","https://openalex.org/I4210134091"]},{"id":"https://openalex.org/I1325784139","display_name":"Yahoo (United Kingdom)","ror":"https://ror.org/038p3gq39","country_code":"GB","type":"company","lineage":["https://openalex.org/I1325784139","https://openalex.org/I4210134091"]}],"countries":["ES","GB"],"is_corresponding":false,"raw_author_name":"Ingmar Weber","raw_affiliation_strings":["Yahoo! Research Barcelona","Yahoo Research, Barcelona"],"affiliations":[{"raw_affiliation_string":"Yahoo! Research Barcelona","institution_ids":["https://openalex.org/I2800095910"]},{"raw_affiliation_string":"Yahoo Research, Barcelona","institution_ids":["https://openalex.org/I1325784139"]}]}],"institutions":[],"countries_distinct_count":4,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5048104591"],"corresponding_institution_ids":["https://openalex.org/I132257509"],"apc_list":null,"apc_paid":null,"fwci":4.8537,"has_fulltext":false,"cited_by_count":15,"citation_normalized_percentile":{"value":0.94842312,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":97},"biblio":{"volume":"7","issue":"1","first_page":"1","last_page":"37"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11644","display_name":"Spam and Phishing Detection","score":0.9976000189781189,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11550","display_name":"Text and Document Classification Technologies","score":0.9948999881744385,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8615072965621948},{"id":"https://openalex.org/keywords/upload","display_name":"Upload","score":0.7080931067466736},{"id":"https://openalex.org/keywords/web-crawler","display_name":"Web crawler","score":0.663032591342926},{"id":"https://openalex.org/keywords/directory","display_name":"Directory","score":0.6246612071990967},{"id":"https://openalex.org/keywords/web-page","display_name":"Web page","score":0.6242033839225769},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.5390737652778625},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.431915283203125},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4211897850036621},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.38024505972862244}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8615072965621948},{"id":"https://openalex.org/C71901391","wikidata":"https://www.wikidata.org/wiki/Q7126699","display_name":"Upload","level":2,"score":0.7080931067466736},{"id":"https://openalex.org/C13743948","wikidata":"https://www.wikidata.org/wiki/Q45842","display_name":"Web crawler","level":2,"score":0.663032591342926},{"id":"https://openalex.org/C2777683733","wikidata":"https://www.wikidata.org/wiki/Q201456","display_name":"Directory","level":2,"score":0.6246612071990967},{"id":"https://openalex.org/C21959979","wikidata":"https://www.wikidata.org/wiki/Q36774","display_name":"Web page","level":2,"score":0.6242033839225769},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.5390737652778625},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.431915283203125},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4211897850036621},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.38024505972862244},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/2435215.2435218","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2435215.2435218","pdf_url":null,"source":{"id":"https://openalex.org/S131231701","display_name":"ACM Transactions on the Web","issn_l":"1559-1131","issn":["1559-1131","1559-114X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on the Web","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.7099999785423279}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":36,"referenced_works":["https://openalex.org/W183313898","https://openalex.org/W195496810","https://openalex.org/W202303397","https://openalex.org/W204010288","https://openalex.org/W1480376833","https://openalex.org/W1489992655","https://openalex.org/W1490796714","https://openalex.org/W1533946607","https://openalex.org/W1577899347","https://openalex.org/W1595552350","https://openalex.org/W1598996557","https://openalex.org/W1723398785","https://openalex.org/W1856393130","https://openalex.org/W1966660365","https://openalex.org/W2008781077","https://openalex.org/W2051461667","https://openalex.org/W2058113710","https://openalex.org/W2059166475","https://openalex.org/W2067698488","https://openalex.org/W2070771761","https://openalex.org/W2105842272","https://openalex.org/W2110520505","https://openalex.org/W2118020653","https://openalex.org/W2122661071","https://openalex.org/W2128294438","https://openalex.org/W2134491992","https://openalex.org/W2152805927","https://openalex.org/W2154045882","https://openalex.org/W2181655653","https://openalex.org/W2214336058","https://openalex.org/W2612649659","https://openalex.org/W2794798126","https://openalex.org/W2952033050","https://openalex.org/W3010318089","https://openalex.org/W4298164899","https://openalex.org/W6608351394"],"related_works":["https://openalex.org/W3115906952","https://openalex.org/W3134811395","https://openalex.org/W2110379974","https://openalex.org/W4385695127","https://openalex.org/W2389761961","https://openalex.org/W2963706618","https://openalex.org/W2113184419","https://openalex.org/W1999548128","https://openalex.org/W4254300012","https://openalex.org/W2102475112"],"abstract_inverted_index":{"Given":[0],"only":[1],"the":[2,26,29,37,124,144,157,164,171,178,203,206,209,218,224,235,254,270],"URL":[3],"of":[4,28,42,62,86,123,180,187,198,208,223,230],"a":[5,40,60,135,140,151,185,199,228,239,259],"Web":[6,30,126,188,210],"page,":[7],"can":[8],"we":[9,16,68,90],"identify":[10],"its":[11],"language?":[12],"In":[13,202,232],"this":[14,18],"article":[15],"examine":[17],"question.":[19],"URL-based":[20,48],"language":[21,49,84,226],"classification":[22,79,112,119],"is":[23,32,39,212],"useful":[24],"when":[25],"content":[27,38,207],"page":[31,211,220],"not":[33],"available":[34],"or":[35],"downloading":[36,221],"waste":[41,229],"bandwidth":[43],"and":[44,56,64,80,96,118,130,148,163,194,216,249,258,265],"time.":[45],"We":[46,103,128,155,175],"built":[47],"classifiers":[50,133,237],"for":[51,77,83,160,167,253,269],"English,":[52],"German,":[53],"French,":[54],"Spanish,":[55],"Italian":[57],"by":[58,113,120],"applying":[59],"variety":[61],"algorithms":[63,67,72,82],"features.":[65],"As":[66,88],"used":[69,91],"machine":[70],"learning":[71],"which":[73],"are":[74],"widely":[75],"applied":[76],"text":[78],"state-of-art":[81],"identification":[85],"text.":[87],"features":[89,98],"words,":[92],"various":[93],"sized":[94],"n-grams,":[95],"custom-made":[97],"(our":[99],"novel":[100],"feature":[101],"set).":[102],"compared":[104],"our":[105,132,181],"approaches":[106],"with":[107,170,242],"two":[108],"baseline":[109],"methods,":[110],"namely":[111],"country":[114],"code":[115],"top-level":[116],"domains":[117],"IP":[121],"addresses":[122],"hosting":[125],"servers.":[127],"trained":[129],"tested":[131],"in":[134,191,217],"10-fold":[136],"cross-validation":[137],"setup":[138],"on":[139,184],"dataset":[141],"obtained":[142,156],"from":[143,149],"Open":[145],"Directory":[146],"Project":[147],"querying":[150],"commercial":[152],"search":[153],"engine.":[154],"lowest":[158],"F1-measure":[159,166,244],"English":[161],"(94)":[162],"highest":[165],"German":[168],"(98)":[169],"best":[172,236],"performing":[173],"classifiers.":[174],"also":[176],"evaluated":[177],"performance":[179],"methods:":[182],"(i)":[183],"set":[186],"pages":[189,222,257],"written":[190],"Adobe":[192,255],"Flash":[193,256],"(ii)":[195],"as":[196],"part":[197],"language-focused":[200,271],"crawler.":[201,272],"first":[204],"case,":[205],"hard":[213],"to":[214],"extract":[215],"second":[219],"\u201cwrong\u201d":[225],"constitutes":[227],"bandwidth.":[231],"both":[233],"settings":[234],"have":[238],"high":[240],"accuracy":[241],"an":[243],"between":[245,261],"95":[246],"(for":[247,251,263,267],"English)":[248],"98":[250],"Italian)":[252,264],"precision":[260],"90":[262],"97":[266],"French)":[268]},"counts_by_year":[{"year":2022,"cited_by_count":2},{"year":2019,"cited_by_count":2},{"year":2018,"cited_by_count":3},{"year":2017,"cited_by_count":2},{"year":2016,"cited_by_count":3},{"year":2015,"cited_by_count":2},{"year":2014,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
