{"id":"https://openalex.org/W2170907470","doi":"https://doi.org/10.1145/1242572.1242628","title":"Efficient search in large textual collections with redundancy","display_name":"Efficient search in large textual collections with redundancy","publication_year":2007,"publication_date":"2007-05-08","ids":{"openalex":"https://openalex.org/W2170907470","doi":"https://doi.org/10.1145/1242572.1242628","mag":"2170907470"},"language":"en","primary_location":{"id":"doi:10.1145/1242572.1242628","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1242572.1242628","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 16th international conference on World Wide Web","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5103193581","display_name":"Jiangong Zhang","orcid":"https://orcid.org/0000-0002-3429-3652"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Jiangong Zhang","raw_affiliation_strings":["Polytechnic University, Brooklyn, NY"],"affiliations":[{"raw_affiliation_string":"Polytechnic University, Brooklyn, NY","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5074323303","display_name":"Torsten Suel","orcid":"https://orcid.org/0000-0002-8324-980X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Torsten Suel","raw_affiliation_strings":["Polytechnic University, Brooklyn, NY"],"affiliations":[{"raw_affiliation_string":"Polytechnic University, Brooklyn, NY","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5103193581"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":13.4513,"has_fulltext":false,"cited_by_count":31,"citation_normalized_percentile":{"value":0.98512255,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"411","last_page":"420"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9984999895095825,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.856356143951416},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.6880546808242798},{"id":"https://openalex.org/keywords/search-engine-indexing","display_name":"Search engine indexing","score":0.6466659307479858},{"id":"https://openalex.org/keywords/search-engine","display_name":"Search engine","score":0.609398603439331},{"id":"https://openalex.org/keywords/redundancy","display_name":"Redundancy (engineering)","score":0.5944634079933167},{"id":"https://openalex.org/keywords/web-search-engine","display_name":"Web search engine","score":0.5736905336380005},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.540781557559967},{"id":"https://openalex.org/keywords/web-search-query","display_name":"Web search query","score":0.529181182384491},{"id":"https://openalex.org/keywords/search-oriented-architecture","display_name":"Search-oriented architecture","score":0.470805287361145},{"id":"https://openalex.org/keywords/index","display_name":"Index (typography)","score":0.45595505833625793},{"id":"https://openalex.org/keywords/web-page","display_name":"Web page","score":0.4503898322582245},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.4384479820728302},{"id":"https://openalex.org/keywords/web-query-classification","display_name":"Web query classification","score":0.3467564284801483},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.1006510853767395}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.856356143951416},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.6880546808242798},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.6466659307479858},{"id":"https://openalex.org/C97854310","wikidata":"https://www.wikidata.org/wiki/Q19541","display_name":"Search engine","level":2,"score":0.609398603439331},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.5944634079933167},{"id":"https://openalex.org/C521815418","wikidata":"https://www.wikidata.org/wiki/Q4182287","display_name":"Web search engine","level":4,"score":0.5736905336380005},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.540781557559967},{"id":"https://openalex.org/C164120249","wikidata":"https://www.wikidata.org/wiki/Q995982","display_name":"Web search query","level":3,"score":0.529181182384491},{"id":"https://openalex.org/C157154645","wikidata":"https://www.wikidata.org/wiki/Q7441612","display_name":"Search-oriented architecture","level":5,"score":0.470805287361145},{"id":"https://openalex.org/C2777382242","wikidata":"https://www.wikidata.org/wiki/Q6017816","display_name":"Index (typography)","level":2,"score":0.45595505833625793},{"id":"https://openalex.org/C21959979","wikidata":"https://www.wikidata.org/wiki/Q36774","display_name":"Web page","level":2,"score":0.4503898322582245},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.4384479820728302},{"id":"https://openalex.org/C118689300","wikidata":"https://www.wikidata.org/wiki/Q7978614","display_name":"Web query classification","level":4,"score":0.3467564284801483},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.1006510853767395},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/1242572.1242628","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1242572.1242628","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 16th international conference on World Wide Web","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":42,"referenced_works":["https://openalex.org/W38366299","https://openalex.org/W191231183","https://openalex.org/W841409518","https://openalex.org/W1483194439","https://openalex.org/W1544843123","https://openalex.org/W1556744446","https://openalex.org/W1575155361","https://openalex.org/W1576397915","https://openalex.org/W1660390307","https://openalex.org/W1669813703","https://openalex.org/W1972418517","https://openalex.org/W1975868314","https://openalex.org/W1993865637","https://openalex.org/W2026095310","https://openalex.org/W2026839093","https://openalex.org/W2029500199","https://openalex.org/W2029673812","https://openalex.org/W2038807029","https://openalex.org/W2042130547","https://openalex.org/W2046862025","https://openalex.org/W2050140640","https://openalex.org/W2056980397","https://openalex.org/W2066636486","https://openalex.org/W2108572348","https://openalex.org/W2111295912","https://openalex.org/W2122416857","https://openalex.org/W2125203709","https://openalex.org/W2126353995","https://openalex.org/W2128308442","https://openalex.org/W2132627996","https://openalex.org/W2138662031","https://openalex.org/W2152565070","https://openalex.org/W2154610494","https://openalex.org/W2156417525","https://openalex.org/W2157022538","https://openalex.org/W2158626901","https://openalex.org/W2160484851","https://openalex.org/W2164774161","https://openalex.org/W2169189540","https://openalex.org/W2621280964","https://openalex.org/W4243255773","https://openalex.org/W4250366158"],"related_works":["https://openalex.org/W1978230837","https://openalex.org/W2169307587","https://openalex.org/W145118210","https://openalex.org/W2980563896","https://openalex.org/W174342267","https://openalex.org/W1642605914","https://openalex.org/W2533706070","https://openalex.org/W2185998359","https://openalex.org/W2406286368","https://openalex.org/W2363322605"],"abstract_inverted_index":{"Current":[0],"web":[1,180,186],"search":[2,21,181,184,197],"engines":[3,69],"focus":[4],"on":[5,131,196],"searching":[6],"only":[7],"themost":[8],"recentsnapshot":[9],"of":[10,30,36,51,59,90,107,118,185],"the":[11,40,143],"web.":[12],"In":[13],"some":[14],"cases,":[15],"however,":[16],"it":[17,135],"would":[18],"be":[19,141],"desirableto":[20],"over":[22,154],"collections":[23,109],"that":[24,58,165],"include":[25],"many":[26,46],"different":[27,88,163],"crawls":[28],"andversions":[29],"each":[31],"page.":[32],"One":[33],"important":[34],"example":[35],"such":[37,52,132],"a":[38,60,91,100,115,155,202],"collectionis":[39],"Internet":[41],"Archive,":[42],"though":[43],"there":[44],"are":[45],"others.":[47],"Sincethe":[48],"data":[49],"size":[50,127,168],"an":[53],"archive":[54],"is":[55,136],"multiple":[56,206],"times":[57],"singlesnapshot,":[61],"this":[62,96,158],"presents":[63],"us":[64],"with":[65,114],"significant":[66,123],"performance":[67],"challenges.Current":[68],"use":[70],"various":[71],"techniques":[72,81],"for":[73,103],"index":[74,126],"compression":[75],"andoptimized":[76],"query":[77,129,199],"execution,":[78],"but":[79],"these":[80],"do":[82],"not":[83],"exploit":[84],"thesignificant":[85],"similarities":[86],"between":[87],"versions":[89],"page,":[92],"or":[93,189],"betweendifferent":[94],"pages.In":[95],"paper,":[97],"we":[98],"propose":[99],"general":[101],"framework":[102],"indexing":[104],"andquery":[105],"processing":[106],"archival":[108,179],"and,":[110],"more":[111],"generally,":[112],"anycollections":[113],"sufficient":[116],"amount":[117],"redundancy.":[119],"Our":[120],"approachresults":[121],"in":[122,125],"reductions":[124],"and":[128,134,139,153,161,172,175,201],"processingcosts":[130],"collections,":[133],"orthogonal":[137],"to":[138,182],"can":[140],"combinedwith":[142],"existing":[144],"techniques.":[145],"It":[146],"also":[147],"supports":[148],"highly":[149],"efficientupdates,":[150],"both":[151],"locally":[152],"network.":[156],"Within":[157],"framework,we":[159],"describe":[160],"evaluate":[162],"implementations":[164],"trade":[166],"offindex":[167],"versus":[169],"CPU":[170],"cost":[171],"other":[173],"factors,":[174],"discuss":[176],"applicationsranging":[177],"from":[178],"local":[183],"sites,email":[187],"archives,":[188],"file":[190],"systems.":[191],"We":[192],"present":[193],"experimental":[194],"resultsbased":[195],"engine":[198],"log":[200],"large":[203],"collection":[204],"consistingof":[205],"crawls.":[207]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2016,"cited_by_count":3},{"year":2015,"cited_by_count":1},{"year":2014,"cited_by_count":1},{"year":2013,"cited_by_count":1},{"year":2012,"cited_by_count":5}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
