{"id":"https://openalex.org/W2012422639","doi":"https://doi.org/10.1145/1531914.1531920","title":"Web spam identification through language model analysis","display_name":"Web spam identification through language model analysis","publication_year":2009,"publication_date":"2009-04-21","ids":{"openalex":"https://openalex.org/W2012422639","doi":"https://doi.org/10.1145/1531914.1531920","mag":"2012422639"},"language":"en","primary_location":{"id":"doi:10.1145/1531914.1531920","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1531914.1531920","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 5th International Workshop on Adversarial Information Retrieval on the Web","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5061310617","display_name":"Juan Mart\u00ednez-Romo","orcid":"https://orcid.org/0000-0002-6905-7051"},"institutions":[{"id":"https://openalex.org/I178450904","display_name":"Universidad Nacional de Educaci\u00f3n a Distancia","ror":"https://ror.org/02msb5n36","country_code":"ES","type":"education","lineage":["https://openalex.org/I178450904"]}],"countries":["ES"],"is_corresponding":false,"raw_author_name":"Juan Martinez-Romo","raw_affiliation_strings":["UNED, Madrid, Spain","UNED, Madrid, SPAIN"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"UNED, Madrid, Spain","institution_ids":["https://openalex.org/I178450904"]},{"raw_affiliation_string":"UNED, Madrid, SPAIN","institution_ids":["https://openalex.org/I178450904"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5076188730","display_name":"Lourdes Araujo","orcid":"https://orcid.org/0000-0002-7657-4794"},"institutions":[{"id":"https://openalex.org/I178450904","display_name":"Universidad Nacional de Educaci\u00f3n a Distancia","ror":"https://ror.org/02msb5n36","country_code":"ES","type":"education","lineage":["https://openalex.org/I178450904"]}],"countries":["ES"],"is_corresponding":false,"raw_author_name":"Lourdes Araujo","raw_affiliation_strings":["UNED, Madrid, Spain","UNED, Madrid, SPAIN"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"UNED, Madrid, Spain","institution_ids":["https://openalex.org/I178450904"]},{"raw_affiliation_string":"UNED, Madrid, SPAIN","institution_ids":["https://openalex.org/I178450904"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":19.4108,"has_fulltext":false,"cited_by_count":58,"citation_normalized_percentile":{"value":0.99078217,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"21","last_page":"28"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11644","display_name":"Spam and Phishing Detection","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11644","display_name":"Spam and Phishing Detection","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12380","display_name":"Authorship Attribution and Profiling","score":0.9955999851226807,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8367849588394165},{"id":"https://openalex.org/keywords/hyperlink","display_name":"Hyperlink","score":0.8134738206863403},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5940923094749451},{"id":"https://openalex.org/keywords/relation","display_name":"Relation (database)","score":0.5665546655654907},{"id":"https://openalex.org/keywords/web-page","display_name":"Web page","score":0.5632557272911072},{"id":"https://openalex.org/keywords/identification","display_name":"Identification (biology)","score":0.5417507290840149},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.5414472222328186},{"id":"https://openalex.org/keywords/link-analysis","display_name":"Link analysis","score":0.5047053098678589},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.47682374715805054},{"id":"https://openalex.org/keywords/divergence","display_name":"Divergence (linguistics)","score":0.4539099633693695},{"id":"https://openalex.org/keywords/spamdexing","display_name":"Spamdexing","score":0.4240644574165344},{"id":"https://openalex.org/keywords/web-navigation","display_name":"Web navigation","score":0.32437562942504883},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.22816306352615356},{"id":"https://openalex.org/keywords/web-search-engine","display_name":"Web search engine","score":0.14233490824699402}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8367849588394165},{"id":"https://openalex.org/C30088001","wikidata":"https://www.wikidata.org/wiki/Q102014","display_name":"Hyperlink","level":3,"score":0.8134738206863403},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5940923094749451},{"id":"https://openalex.org/C25343380","wikidata":"https://www.wikidata.org/wiki/Q277521","display_name":"Relation (database)","level":2,"score":0.5665546655654907},{"id":"https://openalex.org/C21959979","wikidata":"https://www.wikidata.org/wiki/Q36774","display_name":"Web page","level":2,"score":0.5632557272911072},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.5417507290840149},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.5414472222328186},{"id":"https://openalex.org/C1173588","wikidata":"https://www.wikidata.org/wiki/Q6554294","display_name":"Link analysis","level":2,"score":0.5047053098678589},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.47682374715805054},{"id":"https://openalex.org/C207390915","wikidata":"https://www.wikidata.org/wiki/Q1230525","display_name":"Divergence (linguistics)","level":2,"score":0.4539099633693695},{"id":"https://openalex.org/C13565553","wikidata":"https://www.wikidata.org/wiki/Q804206","display_name":"Spamdexing","level":5,"score":0.4240644574165344},{"id":"https://openalex.org/C61096286","wikidata":"https://www.wikidata.org/wiki/Q7978592","display_name":"Web navigation","level":3,"score":0.32437562942504883},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.22816306352615356},{"id":"https://openalex.org/C521815418","wikidata":"https://www.wikidata.org/wiki/Q4182287","display_name":"Web search engine","level":4,"score":0.14233490824699402},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C59822182","wikidata":"https://www.wikidata.org/wiki/Q441","display_name":"Botany","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/1531914.1531920","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1531914.1531920","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 5th International Workshop on Adversarial Information Retrieval on the Web","raw_type":"proceedings-article"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.469.132","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.469.132","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://airweb.cse.lehigh.edu/2009/papers/p21-martinez-romo.pdf","raw_type":"text"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":23,"referenced_works":["https://openalex.org/W202878612","https://openalex.org/W1527781663","https://openalex.org/W1570448133","https://openalex.org/W1845137714","https://openalex.org/W1984160751","https://openalex.org/W1986831811","https://openalex.org/W1987365175","https://openalex.org/W2066055909","https://openalex.org/W2073562093","https://openalex.org/W2093390569","https://openalex.org/W2099111195","https://openalex.org/W2100081831","https://openalex.org/W2107428549","https://openalex.org/W2115394986","https://openalex.org/W2121308534","https://openalex.org/W2130196635","https://openalex.org/W2156632103","https://openalex.org/W2169270715","https://openalex.org/W2401383085","https://openalex.org/W2966207845","https://openalex.org/W4206765718","https://openalex.org/W6631431479","https://openalex.org/W6712844215"],"related_works":["https://openalex.org/W2245616560","https://openalex.org/W2116757369","https://openalex.org/W2583864867","https://openalex.org/W2351804282","https://openalex.org/W2048998278","https://openalex.org/W1556894713","https://openalex.org/W1490416172","https://openalex.org/W158130761","https://openalex.org/W2588706232","https://openalex.org/W2916824151"],"abstract_inverted_index":{"This":[0],"paper":[1],"applies":[2],"a":[3,14,34,44,59,68,122,131],"language":[4,100],"model":[5],"approach":[6],"to":[7,19,64,97],"different":[8,54,104],"sources":[9,55,92],"of":[10,27,56,58,67,90,93,106,119,137],"information":[11,57,94],"extracted":[12],"from":[13],"Web":[15,28,60,138],"page,":[16],"in":[17,24,95,111,125],"order":[18,96],"provide":[20],"high":[21],"quality":[22],"indicators":[23],"the":[25,65,80,103,135],"detection":[26,136],"Spam.":[29],"Two":[30],"pages":[31],"linked":[32,84],"by":[33],"hyperlink":[35],"should":[36],"be":[37],"topically":[38],"related,":[39],"even":[40],"though":[41],"this":[42,49],"were":[43],"weak":[45],"contextual":[46],"relation.":[47],"For":[48],"reason":[50],"we":[51,71,87,114],"have":[52,72],"analysed":[53],"page":[61],"that":[62,133],"belongs":[63],"context":[66],"link":[69],"and":[70,108,143,149],"applied":[73],"Kullback-Leibler":[74],"divergence":[75],"on":[76,140],"them":[77],"for":[78],"characterising":[79],"relationship":[81],"between":[82],"two":[83,141],"pages.":[85],"Moreover,":[86],"combine":[88],"some":[89],"these":[91,117],"obtain":[98],"richer":[99],"models.":[101],"Given":[102],"nature":[105],"internal":[107],"external":[109],"links,":[110],"our":[112],"study":[113],"also":[115],"distinguished":[116],"types":[118],"links":[120],"getting":[121],"significant":[123],"improvement":[124],"classification":[126],"tasks.":[127],"The":[128],"result":[129],"is":[130],"system":[132],"improves":[134],"Spam":[139],"large":[142],"public":[144],"datasets":[145],"such":[146],"as":[147],"WEBSPAM-UK2006":[148],"WEBSPAM-UK2007.":[150]},"counts_by_year":[{"year":2023,"cited_by_count":1},{"year":2020,"cited_by_count":1},{"year":2019,"cited_by_count":4},{"year":2018,"cited_by_count":7},{"year":2017,"cited_by_count":1},{"year":2016,"cited_by_count":1},{"year":2015,"cited_by_count":4},{"year":2014,"cited_by_count":4},{"year":2013,"cited_by_count":10},{"year":2012,"cited_by_count":10}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
