{"id":"https://openalex.org/W2095007791","doi":"https://doi.org/10.1145/2806416.2806581","title":"Ranking Deep Web Text Collections for Scalable Information Extraction","display_name":"Ranking Deep Web Text Collections for Scalable Information Extraction","publication_year":2015,"publication_date":"2015-10-17","ids":{"openalex":"https://openalex.org/W2095007791","doi":"https://doi.org/10.1145/2806416.2806581","mag":"2095007791"},"language":"en","primary_location":{"id":"doi:10.1145/2806416.2806581","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2806416.2806581","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 24th ACM International on Conference on Information and Knowledge Management","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://biblio.ugent.be/publication/7235609/file/7235611.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5074134679","display_name":"Pablo Barrio","orcid":"https://orcid.org/0000-0002-4410-0682"},"institutions":[{"id":"https://openalex.org/I78577930","display_name":"Columbia University","ror":"https://ror.org/00hj8s172","country_code":"US","type":"education","lineage":["https://openalex.org/I78577930"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Pablo Barrio","raw_affiliation_strings":["Columbia University, New York, NY, USA"],"affiliations":[{"raw_affiliation_string":"Columbia University, New York, NY, USA","institution_ids":["https://openalex.org/I78577930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080063580","display_name":"Luis Gravano","orcid":null},"institutions":[{"id":"https://openalex.org/I78577930","display_name":"Columbia University","ror":"https://ror.org/00hj8s172","country_code":"US","type":"education","lineage":["https://openalex.org/I78577930"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Luis Gravano","raw_affiliation_strings":["Columbia University, New York, NY, USA"],"affiliations":[{"raw_affiliation_string":"Columbia University, New York, NY, USA","institution_ids":["https://openalex.org/I78577930"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5084742757","display_name":"Chris Develder","orcid":"https://orcid.org/0000-0003-2707-4176"},"institutions":[{"id":"https://openalex.org/I32597200","display_name":"Ghent University","ror":"https://ror.org/00cv9y106","country_code":"BE","type":"education","lineage":["https://openalex.org/I32597200"]},{"id":"https://openalex.org/I39327780","display_name":"iMinds","ror":"https://ror.org/03baec336","country_code":"BE","type":"nonprofit","lineage":["https://openalex.org/I39327780"]}],"countries":["BE"],"is_corresponding":false,"raw_author_name":"Chris Develder","raw_affiliation_strings":["Ghent University - iMinds, Ghent, Belgium","Ghent University, iMinds, Ghent, Belgium#TAB#"],"affiliations":[{"raw_affiliation_string":"Ghent University - iMinds, Ghent, Belgium","institution_ids":["https://openalex.org/I39327780","https://openalex.org/I32597200"]},{"raw_affiliation_string":"Ghent University, iMinds, Ghent, Belgium#TAB#","institution_ids":["https://openalex.org/I32597200"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5074134679"],"corresponding_institution_ids":["https://openalex.org/I78577930"],"apc_list":null,"apc_paid":null,"fwci":3.2936,"has_fulltext":true,"cited_by_count":4,"citation_normalized_percentile":{"value":0.93051335,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"153","last_page":"162"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11478","display_name":"Caching and Content Delivery","score":0.9977999925613403,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10317","display_name":"Advanced Database Systems and Queries","score":0.9868000149726868,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8595564365386963},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.7145226001739502},{"id":"https://openalex.org/keywords/ranking","display_name":"Ranking (information retrieval)","score":0.6235297918319702},{"id":"https://openalex.org/keywords/information-extraction","display_name":"Information extraction","score":0.6127614378929138},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.551189661026001},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.5474454164505005},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5180785655975342},{"id":"https://openalex.org/keywords/tuple","display_name":"Tuple","score":0.49679210782051086},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.47260409593582153},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.4268200993537903},{"id":"https://openalex.org/keywords/plain-text","display_name":"Plain text","score":0.42631202936172485},{"id":"https://openalex.org/keywords/resource","display_name":"Resource (disambiguation)","score":0.41045185923576355},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.39434924721717834},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.3482162356376648},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.33994555473327637},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.14443251490592957}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8595564365386963},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.7145226001739502},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.6235297918319702},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.6127614378929138},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.551189661026001},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.5474454164505005},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5180785655975342},{"id":"https://openalex.org/C118930307","wikidata":"https://www.wikidata.org/wiki/Q600590","display_name":"Tuple","level":2,"score":0.49679210782051086},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.47260409593582153},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.4268200993537903},{"id":"https://openalex.org/C46503548","wikidata":"https://www.wikidata.org/wiki/Q1145976","display_name":"Plain text","level":3,"score":0.42631202936172485},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.41045185923576355},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.39434924721717834},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.3482162356376648},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.33994555473327637},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.14443251490592957},{"id":"https://openalex.org/C148730421","wikidata":"https://www.wikidata.org/wiki/Q141090","display_name":"Encryption","level":2,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C118615104","wikidata":"https://www.wikidata.org/wiki/Q121416","display_name":"Discrete mathematics","level":1,"score":0.0},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1145/2806416.2806581","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2806416.2806581","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 24th ACM International on Conference on Information and Knowledge Management","raw_type":"proceedings-article"},{"id":"pmh:oai:archive.ugent.be:7235609","is_oa":true,"landing_page_url":"http://hdl.handle.net/1854/LU-7235609","pdf_url":"https://biblio.ugent.be/publication/7235609/file/7235611.pdf","source":{"id":"https://openalex.org/S4306400478","display_name":"Ghent University Academic Bibliography (Ghent University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I32597200","host_organization_name":"Ghent University","host_organization_lineage":["https://openalex.org/I32597200"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"ISBN: 978-1-4503-3794-6","raw_type":"info:eu-repo/semantics/publishedVersion"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.722.7489","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.722.7489","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://www.cs.columbia.edu/%7Egravano/Papers/2015/cikm2015.pdf","raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:archive.ugent.be:7235609","is_oa":true,"landing_page_url":"http://hdl.handle.net/1854/LU-7235609","pdf_url":"https://biblio.ugent.be/publication/7235609/file/7235611.pdf","source":{"id":"https://openalex.org/S4306400478","display_name":"Ghent University Academic Bibliography (Ghent University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I32597200","host_organization_name":"Ghent University","host_organization_lineage":["https://openalex.org/I32597200"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"ISBN: 978-1-4503-3794-6","raw_type":"info:eu-repo/semantics/publishedVersion"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G3238411697","display_name":null,"funder_award_id":"IIS-08-11038","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G8456764768","display_name":null,"funder_award_id":"D11PC20153","funder_id":"https://openalex.org/F4320306116","funder_display_name":"U.S. Department of the Interior"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320306116","display_name":"U.S. Department of the Interior","ror":"https://ror.org/03v0pmy70"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2095007791.pdf","grobid_xml":"https://content.openalex.org/works/W2095007791.grobid-xml"},"referenced_works_count":41,"referenced_works":["https://openalex.org/W343945789","https://openalex.org/W1434647451","https://openalex.org/W1493270114","https://openalex.org/W1493490255","https://openalex.org/W1515300998","https://openalex.org/W1533179050","https://openalex.org/W1934019294","https://openalex.org/W1967932772","https://openalex.org/W1986828474","https://openalex.org/W2002682102","https://openalex.org/W2003975698","https://openalex.org/W2009018050","https://openalex.org/W2013970953","https://openalex.org/W2014016001","https://openalex.org/W2016892599","https://openalex.org/W2023450550","https://openalex.org/W2043499927","https://openalex.org/W2059553973","https://openalex.org/W2060088491","https://openalex.org/W2086253379","https://openalex.org/W2094930182","https://openalex.org/W2096891167","https://openalex.org/W2116341550","https://openalex.org/W2125121047","https://openalex.org/W2125969310","https://openalex.org/W2128894810","https://openalex.org/W2129214100","https://openalex.org/W2141099517","https://openalex.org/W2155341658","https://openalex.org/W2163362093","https://openalex.org/W2164135737","https://openalex.org/W2170188121","https://openalex.org/W2294284203","https://openalex.org/W2340309946","https://openalex.org/W2399179762","https://openalex.org/W2963639300","https://openalex.org/W4255459561","https://openalex.org/W6629638141","https://openalex.org/W6631984995","https://openalex.org/W6678605463","https://openalex.org/W6685116542"],"related_works":["https://openalex.org/W4245395944","https://openalex.org/W2143551613","https://openalex.org/W2408506617","https://openalex.org/W1990527953","https://openalex.org/W2030910246","https://openalex.org/W4287775364","https://openalex.org/W36911888","https://openalex.org/W13099415","https://openalex.org/W3029858749","https://openalex.org/W2140970666"],"abstract_inverted_index":{"Information":[0],"extraction":[1,71,94,178],"(IE)":[2],"systems":[3],"discover":[4],"structured":[5,96],"information":[6,204,249],"from":[7],"natural":[8],"language":[9],"text,":[10,47],"to":[11,77,92,104,175],"enable":[12],"much":[13],"richer":[14],"querying":[15],"and":[16,34,163,206,218,227,235],"data":[17],"mining":[18],"than":[19],"possible":[20],"directly":[21],"over":[22,43,213],"the":[23,56,70,79,83,93,112,147,177,191,225,230],"unstructured":[24],"text.":[25],"Unfortunately,":[26],"IE":[27,57,84,113,173,222],"is":[28,48],"generally":[29],"a":[30,65,237],"computationally":[31],"expensive":[32],"process,":[33],"hence":[35,115,127],"improving":[36],"its":[37],"efficiency,":[38],"so":[39],"that":[40,90],"it":[41],"scales":[42],"large":[44],"volumes":[45],"of":[46,49,95,144,187,229,233],"critical":[50],"importance.":[51],"State-of-the-art":[52],"approaches":[53,68],"for":[54,82,111,132,166,171,190,202,219,239,246],"scaling":[55],"process":[58],"focus":[59,138],"on":[60,139,182],"one":[61],"text":[62,107,145],"collection":[63],"at":[64,86],"time.":[66],"These":[67,98],"prioritize":[69,176],"effort":[72,179],"by":[73,180],"learning":[74],"keyword":[75],"queries":[76],"identify":[78],"\"useful\"":[80],"documents":[81,189],"task":[85],"hand,":[87],"namely,":[88],"those":[89],"lead":[91],"\"tuples.\"":[97],"approaches,":[99,234],"however,":[100],"do":[101],"not":[102,122],"attempt":[103],"predict":[105],"which":[106,119],"collections":[108,170,183],"are":[109,155],"useful":[110,125,188],"task---and":[114],"merit":[116],"further":[117],"processing---and":[118],"ones":[120],"will":[121],"contribute":[123],"any":[124],"output---and":[126],"should":[128],"be":[129],"ignored":[130],"altogether,":[131],"efficiency.":[133],"In":[134],"this":[135,241],"paper,":[136],"we":[137,161],"an":[140,172],"especially":[141],"valuable":[142],"family":[143],"sources,":[146],"so-called":[148],"deep":[149,168,215],"web":[150,169,216],"collections,":[151,217],"whose":[152],"(remote)":[153],"contents":[154],"only":[156],"accessible":[157],"via":[158],"querying.":[159],"Specifically,":[160],"introduce":[162],"study":[164,194],"techniques":[165],"ranking":[167],"task,":[174],"focusing":[181],"with":[184],"substantial":[185],"numbers":[186],"task.":[192],"We":[193],"both":[195],"(adaptations":[196],"of)":[197],"state-of-the-art":[198],"resource":[199],"selection":[200],"strategies":[201],"distributed":[203],"retrieval,":[205],"IE-specific":[207],"approaches.":[208],"Our":[209],"extensive":[210],"experimental":[211],"evaluation":[212],"realistic":[214],"several":[220],"different":[221],"tasks,":[223],"shows":[224],"merits":[226],"limitations":[228],"alternative":[231],"families":[232],"provides":[236],"roadmap":[238],"addressing":[240],"critically":[242],"important":[243],"building":[244],"block":[245],"efficient,":[247],"scalable":[248],"extraction.":[250]},"counts_by_year":[{"year":2018,"cited_by_count":1},{"year":2016,"cited_by_count":3}],"updated_date":"2026-04-05T17:49:38.594831","created_date":"2025-10-10T00:00:00"}
