{"id":"https://openalex.org/W2117629987","doi":"https://doi.org/10.1145/1028099.1028101","title":"Analysis of lexical signatures for improving information persistence on the World Wide Web","display_name":"Analysis of lexical signatures for improving information persistence on the World Wide Web","publication_year":2004,"publication_date":"2004-10-01","ids":{"openalex":"https://openalex.org/W2117629987","doi":"https://doi.org/10.1145/1028099.1028101","mag":"2117629987"},"language":"en","primary_location":{"id":"doi:10.1145/1028099.1028101","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1028099.1028101","pdf_url":null,"source":{"id":"https://openalex.org/S4394735545","display_name":"ACM Transactions on Information Systems","issn_l":"1046-8188","issn":["1046-8188","1558-2868"],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Information Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5057019969","display_name":"Seung-Taek Park","orcid":null},"institutions":[{"id":"https://openalex.org/I4210134091","display_name":"Yahoo (United States)","ror":"https://ror.org/040dkzz12","country_code":"US","type":"company","lineage":["https://openalex.org/I4210134091"]},{"id":"https://openalex.org/I1325784139","display_name":"Yahoo (United Kingdom)","ror":"https://ror.org/038p3gq39","country_code":"GB","type":"company","lineage":["https://openalex.org/I1325784139","https://openalex.org/I4210134091"]}],"countries":["GB","US"],"is_corresponding":true,"raw_author_name":"Seung-Taek Park","raw_affiliation_strings":["Yahoo! Research Labs, Pasadena, CA","Yahoo! Research Labs, Pasadena, CA#TAB#"],"affiliations":[{"raw_affiliation_string":"Yahoo! Research Labs, Pasadena, CA","institution_ids":["https://openalex.org/I4210134091"]},{"raw_affiliation_string":"Yahoo! Research Labs, Pasadena, CA#TAB#","institution_ids":["https://openalex.org/I1325784139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106235930","display_name":"David M. Pennock","orcid":null},"institutions":[{"id":"https://openalex.org/I1325784139","display_name":"Yahoo (United Kingdom)","ror":"https://ror.org/038p3gq39","country_code":"GB","type":"company","lineage":["https://openalex.org/I1325784139","https://openalex.org/I4210134091"]},{"id":"https://openalex.org/I4210134091","display_name":"Yahoo (United States)","ror":"https://ror.org/040dkzz12","country_code":"US","type":"company","lineage":["https://openalex.org/I4210134091"]}],"countries":["GB","US"],"is_corresponding":false,"raw_author_name":"David M. Pennock","raw_affiliation_strings":["Yahoo! Research Labs, Pasadena, CA","Yahoo! Research Labs, Pasadena, CA#TAB#"],"affiliations":[{"raw_affiliation_string":"Yahoo! Research Labs, Pasadena, CA","institution_ids":["https://openalex.org/I4210134091"]},{"raw_affiliation_string":"Yahoo! Research Labs, Pasadena, CA#TAB#","institution_ids":["https://openalex.org/I1325784139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001294898","display_name":"C. Lee Giles","orcid":"https://orcid.org/0000-0002-1931-585X"},"institutions":[{"id":"https://openalex.org/I130769515","display_name":"Pennsylvania State University","ror":"https://ror.org/04p491231","country_code":"US","type":"education","lineage":["https://openalex.org/I130769515"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"C. Lee Giles","raw_affiliation_strings":["The Pennsylvania State University, University Park, PA","THE PENNSYLVANIA STATE UNIVERSITY, UNIVERSITY PARK, PA"],"affiliations":[{"raw_affiliation_string":"The Pennsylvania State University, University Park, PA","institution_ids":["https://openalex.org/I130769515"]},{"raw_affiliation_string":"THE PENNSYLVANIA STATE UNIVERSITY, UNIVERSITY PARK, PA","institution_ids":["https://openalex.org/I130769515"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5035365456","display_name":"Robert Krovetz","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Robert Krovetz","raw_affiliation_strings":["Ask Jeeves, Piscataway, NJ","Ask Jeeves, Piscataway, NJ#TAB#"],"affiliations":[{"raw_affiliation_string":"Ask Jeeves, Piscataway, NJ","institution_ids":[]},{"raw_affiliation_string":"Ask Jeeves, Piscataway, NJ#TAB#","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5057019969"],"corresponding_institution_ids":["https://openalex.org/I1325784139","https://openalex.org/I4210134091"],"apc_list":null,"apc_paid":null,"fwci":4.0374,"has_fulltext":false,"cited_by_count":36,"citation_normalized_percentile":{"value":0.94267392,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":97},"biblio":{"volume":"22","issue":"4","first_page":"540","last_page":"572"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13976","display_name":"Web visibility and informetrics","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10286","display_name":"Information Retrieval and Search Behavior","score":0.9904000163078308,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/tf\u2013idf","display_name":"tf\u2013idf","score":0.8714609146118164},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8478773832321167},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.6442734003067017},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5433597564697266},{"id":"https://openalex.org/keywords/ranking","display_name":"Ranking (information retrieval)","score":0.5357016921043396},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.45833733677864075},{"id":"https://openalex.org/keywords/generator","display_name":"Generator (circuit theory)","score":0.41257408261299133},{"id":"https://openalex.org/keywords/term","display_name":"Term (time)","score":0.3780971169471741},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.3371099829673767},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3224589228630066},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.08134225010871887}],"concepts":[{"id":"https://openalex.org/C81758059","wikidata":"https://www.wikidata.org/wiki/Q796584","display_name":"tf\u2013idf","level":3,"score":0.8714609146118164},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8478773832321167},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.6442734003067017},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5433597564697266},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.5357016921043396},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.45833733677864075},{"id":"https://openalex.org/C2780992000","wikidata":"https://www.wikidata.org/wiki/Q17016113","display_name":"Generator (circuit theory)","level":3,"score":0.41257408261299133},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.3780971169471741},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3371099829673767},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3224589228630066},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.08134225010871887},{"id":"https://openalex.org/C163258240","wikidata":"https://www.wikidata.org/wiki/Q25342","display_name":"Power (physics)","level":2,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/1028099.1028101","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1028099.1028101","pdf_url":null,"source":{"id":"https://openalex.org/S4394735545","display_name":"ACM Transactions on Information Systems","issn_l":"1046-8188","issn":["1046-8188","1558-2868"],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Information Systems","raw_type":"journal-article"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.95.2712","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.95.2712","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://clgiles.ist.psu.edu/papers/TOIS-2004-lexical_signatures.pdf","raw_type":"text"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/1","score":0.4099999964237213,"display_name":"No poverty"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320312062","display_name":"T\u00fcrkiye Atom Enerjisi Kurumu","ror":"https://ror.org/05hp77p11"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":45,"referenced_works":["https://openalex.org/W148561453","https://openalex.org/W1521329240","https://openalex.org/W1525293755","https://openalex.org/W1525661410","https://openalex.org/W1546842576","https://openalex.org/W1566984846","https://openalex.org/W1605217017","https://openalex.org/W1622499492","https://openalex.org/W1641167210","https://openalex.org/W1659541576","https://openalex.org/W1983416950","https://openalex.org/W1985554184","https://openalex.org/W2013207865","https://openalex.org/W2016522586","https://openalex.org/W2023097373","https://openalex.org/W2025243215","https://openalex.org/W2043200638","https://openalex.org/W2053584122","https://openalex.org/W2054024154","https://openalex.org/W2054567126","https://openalex.org/W2055289026","https://openalex.org/W2059713800","https://openalex.org/W2063994606","https://openalex.org/W2088429233","https://openalex.org/W2117850397","https://openalex.org/W2119221697","https://openalex.org/W2123466232","https://openalex.org/W2147164982","https://openalex.org/W2154498027","https://openalex.org/W2155711776","https://openalex.org/W2165028605","https://openalex.org/W2168190036","https://openalex.org/W2310264465","https://openalex.org/W2553945548","https://openalex.org/W2915063781","https://openalex.org/W2987772435","https://openalex.org/W2989631226","https://openalex.org/W4229912654","https://openalex.org/W4285719527","https://openalex.org/W6606075203","https://openalex.org/W6631082736","https://openalex.org/W6663913501","https://openalex.org/W6666947055","https://openalex.org/W6795354790","https://openalex.org/W6906355099"],"related_works":["https://openalex.org/W2382433580","https://openalex.org/W2100326285","https://openalex.org/W2369751049","https://openalex.org/W2971320423","https://openalex.org/W2198237484","https://openalex.org/W4296339319","https://openalex.org/W2381981226","https://openalex.org/W1998998501","https://openalex.org/W2383777945","https://openalex.org/W2365583997"],"abstract_inverted_index":{"A":[0],"&lt;i&gt;lexical":[1],"signature&lt;/i&gt;":[2],"(LS)":[3],"consisting":[4],"of":[5,34,49,141,164,223],"several":[6],"key":[7],"words":[8],"from":[9],"a":[10,30,67,72,202],"Web":[11,65],"document":[12,20,105,228],"is":[13,98],"often":[14,153],"sufficient":[15],"information":[16],"for":[17,37,195],"finding":[18,116,230],"the":[19,64,84,96,123,136,161,165,188,226,248],"later,":[21],"even":[22],"if":[23,95],"its":[24],"URL":[25],"has":[26],"changed.":[27],"We":[28,59,200],"conduct":[29],"large-scale":[31],"empirical":[32],"study":[33],"nine":[35],"methods":[36,177,194,220,240],"generating":[38,196],"lexical":[39,144,198],"signatures,":[40],"including":[41],"Phelps":[42],"and":[43,54,70,89,152,175,229],"Wilensky's":[44],"original":[45,85,97],"proposal":[46],"(PW),":[47],"seven":[48],"our":[50],"own":[51],"static":[52,193,219],"variations,":[53],"one":[55],"new":[56],"dynamic":[57,203],"method.":[58],"examine":[60],"their":[61,77],"performance":[62,243],"on":[63,71,122,135,160],"over":[66,233],"10-month":[68],"period,":[69],"TREC":[73,126],"data":[74,127],"set,":[75,128],"evaluating":[76],"ability":[78],"to":[79,103,133,150,186,211],"both":[80,224],"(1)":[81],"uniquely":[82],"identify":[83],"(possibly":[86],"modified)":[87],"document,":[88],"(2)":[90],"locate":[91],"other":[92],"relevant":[93,117,231],"documents":[94,246],"lost.":[99],"Lexical":[100],"signatures":[101,145],"chosen":[102],"minimize":[104],"frequency":[106],"(DF)":[107],"are":[108,147,157,250],"good":[109],"at":[110,115],"unique":[111],"identification":[112],"but":[113,129,156],"poor":[114],"documents.":[118,142],"PW":[119],"works":[120],"well":[121],"relatively":[124],"small":[125],"acts":[130],"almost":[131],"identically":[132],"DF":[134,180],"Web,":[137],"which":[138],"contains":[139],"billions":[140],"Term-frequency-based":[143],"(TF)":[146],"very":[148],"easy":[149],"compute":[151],"perform":[154],"well,":[155],"highly":[158],"dependent":[159],"ranking":[162],"system":[163],"search":[166,236],"engine":[167],"used.":[168],"The":[169],"term-frequency":[170],"inverse-document-frequency-":[171],"(TFIDF-)":[172],"based":[173],"method":[174],"hybrid":[176],"(which":[178],"combine":[179],"with":[181],"TF":[182],"or":[183],"TFIDF)":[184],"seem":[185],"be":[187],"most":[189],"promising":[190],"candidates":[191],"among":[192],"effective":[197],"signatures.":[199],"propose":[201],"LS":[204,213,239],"generator":[205],"called":[206],"&lt;i&gt;Test":[207],"&amp;":[208],"Select&lt;/i&gt;":[209],"(TS)":[210],"mitigate":[212],"conflict.":[214],"TS":[215],"outperforms":[216],"all":[217],"eight":[218],"in":[221,247],"terms":[222],"extracting":[225],"desired":[227],"information,":[232],"three":[234],"different":[235],"engines.":[237],"All":[238],"show":[241],"significant":[242],"degradation":[244],"as":[245],"corpus":[249],"edited.":[251]},"counts_by_year":[{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":2},{"year":2016,"cited_by_count":3},{"year":2014,"cited_by_count":2},{"year":2013,"cited_by_count":1},{"year":2012,"cited_by_count":2}],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2025-10-10T00:00:00"}
