{"id":"https://openalex.org/W3138074544","doi":"https://doi.org/10.1109/bigdata50022.2020.9377742","title":"WebLens: Towards Web-scale Data Integration, Training the Models","display_name":"WebLens: Towards Web-scale Data Integration, Training the Models","publication_year":2020,"publication_date":"2020-12-10","ids":{"openalex":"https://openalex.org/W3138074544","doi":"https://doi.org/10.1109/bigdata50022.2020.9377742","mag":"3138074544"},"language":"en","primary_location":{"id":"doi:10.1109/bigdata50022.2020.9377742","is_oa":false,"landing_page_url":"https://doi.org/10.1109/bigdata50022.2020.9377742","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2020 IEEE International Conference on Big Data (Big Data)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5071549682","display_name":"Rituparna Khan","orcid":null},"institutions":[{"id":"https://openalex.org/I103163165","display_name":"Florida State University","ror":"https://ror.org/05g3dte14","country_code":"US","type":"education","lineage":["https://openalex.org/I103163165"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Rituparna Khan","raw_affiliation_strings":["Florida State University"],"affiliations":[{"raw_affiliation_string":"Florida State University","institution_ids":["https://openalex.org/I103163165"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5047938200","display_name":"Michael Gubanov","orcid":"https://orcid.org/0000-0002-1354-1215"},"institutions":[{"id":"https://openalex.org/I103163165","display_name":"Florida State University","ror":"https://ror.org/05g3dte14","country_code":"US","type":"education","lineage":["https://openalex.org/I103163165"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Michael Gubanov","raw_affiliation_strings":["Florida State University"],"affiliations":[{"raw_affiliation_string":"Florida State University","institution_ids":["https://openalex.org/I103163165"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5071549682"],"corresponding_institution_ids":["https://openalex.org/I103163165"],"apc_list":null,"apc_paid":null,"fwci":0.1795,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.60446121,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":94},"biblio":{"volume":"17","issue":null,"first_page":"5727","last_page":"5729"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10317","display_name":"Advanced Database Systems and Queries","score":0.9965000152587891,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/schema-matching","display_name":"Schema matching","score":0.9022641181945801},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8345187902450562},{"id":"https://openalex.org/keywords/data-integration","display_name":"Data integration","score":0.7546085119247437},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.5941174030303955},{"id":"https://openalex.org/keywords/schema","display_name":"Schema (genetic algorithms)","score":0.5616936087608337},{"id":"https://openalex.org/keywords/schema-evolution","display_name":"Schema evolution","score":0.550159215927124},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.5373133420944214},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.5116243958473206},{"id":"https://openalex.org/keywords/information-schema","display_name":"Information schema","score":0.45919716358184814},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.448234498500824},{"id":"https://openalex.org/keywords/star-schema","display_name":"Star schema","score":0.42548027634620667},{"id":"https://openalex.org/keywords/pairwise-comparison","display_name":"Pairwise comparison","score":0.42126813530921936},{"id":"https://openalex.org/keywords/relational-database","display_name":"Relational database","score":0.40195485949516296},{"id":"https://openalex.org/keywords/database-schema","display_name":"Database schema","score":0.3527931869029999},{"id":"https://openalex.org/keywords/theoretical-computer-science","display_name":"Theoretical computer science","score":0.3219740390777588},{"id":"https://openalex.org/keywords/semi-structured-model","display_name":"Semi-structured model","score":0.31388747692108154},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.2985692620277405},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.22575214505195618},{"id":"https://openalex.org/keywords/database-model","display_name":"Database model","score":0.12604761123657227},{"id":"https://openalex.org/keywords/database-design","display_name":"Database design","score":0.09817361831665039}],"concepts":[{"id":"https://openalex.org/C2777327318","wikidata":"https://www.wikidata.org/wiki/Q1408390","display_name":"Schema matching","level":3,"score":0.9022641181945801},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8345187902450562},{"id":"https://openalex.org/C72634772","wikidata":"https://www.wikidata.org/wiki/Q386824","display_name":"Data integration","level":2,"score":0.7546085119247437},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.5941174030303955},{"id":"https://openalex.org/C52146309","wikidata":"https://www.wikidata.org/wiki/Q7431116","display_name":"Schema (genetic algorithms)","level":2,"score":0.5616936087608337},{"id":"https://openalex.org/C2780660560","wikidata":"https://www.wikidata.org/wiki/Q3951893","display_name":"Schema evolution","level":4,"score":0.550159215927124},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.5373133420944214},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.5116243958473206},{"id":"https://openalex.org/C150012506","wikidata":"https://www.wikidata.org/wiki/Q6031185","display_name":"Information schema","level":5,"score":0.45919716358184814},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.448234498500824},{"id":"https://openalex.org/C190703929","wikidata":"https://www.wikidata.org/wiki/Q1331138","display_name":"Star schema","level":4,"score":0.42548027634620667},{"id":"https://openalex.org/C184898388","wikidata":"https://www.wikidata.org/wiki/Q1435712","display_name":"Pairwise comparison","level":2,"score":0.42126813530921936},{"id":"https://openalex.org/C5655090","wikidata":"https://www.wikidata.org/wiki/Q192588","display_name":"Relational database","level":2,"score":0.40195485949516296},{"id":"https://openalex.org/C30775581","wikidata":"https://www.wikidata.org/wiki/Q632285","display_name":"Database schema","level":3,"score":0.3527931869029999},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3219740390777588},{"id":"https://openalex.org/C56310702","wikidata":"https://www.wikidata.org/wiki/Q2269281","display_name":"Semi-structured model","level":4,"score":0.31388747692108154},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.2985692620277405},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.22575214505195618},{"id":"https://openalex.org/C5968703","wikidata":"https://www.wikidata.org/wiki/Q267136","display_name":"Database model","level":3,"score":0.12604761123657227},{"id":"https://openalex.org/C148840519","wikidata":"https://www.wikidata.org/wiki/Q1049878","display_name":"Database design","level":2,"score":0.09817361831665039},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/bigdata50022.2020.9377742","is_oa":false,"landing_page_url":"https://doi.org/10.1109/bigdata50022.2020.9377742","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2020 IEEE International Conference on Big Data (Big Data)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":33,"referenced_works":["https://openalex.org/W410850256","https://openalex.org/W1480652413","https://openalex.org/W1489949474","https://openalex.org/W1494640588","https://openalex.org/W2008896880","https://openalex.org/W2066232799","https://openalex.org/W2066636486","https://openalex.org/W2067308467","https://openalex.org/W2092486351","https://openalex.org/W2108223890","https://openalex.org/W2108267498","https://openalex.org/W2125822162","https://openalex.org/W2165467455","https://openalex.org/W2400847108","https://openalex.org/W2407983160","https://openalex.org/W2542459869","https://openalex.org/W2613788219","https://openalex.org/W2616271757","https://openalex.org/W3013103751","https://openalex.org/W3029621839","https://openalex.org/W3029701880","https://openalex.org/W3031051334","https://openalex.org/W3031617143","https://openalex.org/W3032215537","https://openalex.org/W3105771849","https://openalex.org/W3106020963","https://openalex.org/W4249874291","https://openalex.org/W6614148910","https://openalex.org/W6628748529","https://openalex.org/W6629296869","https://openalex.org/W6678767931","https://openalex.org/W6713149151","https://openalex.org/W6713731013"],"related_works":["https://openalex.org/W2103472145","https://openalex.org/W166845585","https://openalex.org/W1513459549","https://openalex.org/W1591654213","https://openalex.org/W2351507151","https://openalex.org/W1971380397","https://openalex.org/W2139135093","https://openalex.org/W2584207435","https://openalex.org/W1499427961","https://openalex.org/W896179304"],"abstract_inverted_index":{"Schema":[0],"matching,":[1],"historically,":[2],"is":[3,119,124],"a":[4,23,31,43,110,131,148,186,208],"sub-area":[5],"of":[6,77,81,83,106,189,197,217,229],"Data":[7],"Integration,":[8],"responsible":[9],"for":[10,85,169],"matching":[11,26,33,90,118],"relational":[12,218],"or":[13],"semi-structured":[14],"schemas":[15,48,168],"to":[16,99,136,158,172,176],"facilitate":[17],"further":[18],"data":[19,51,58,150],"integration":[20,52,151],"process.":[21],"In":[22,180],"standard":[24],"schema":[25,56,60,89],"scenario,":[27],"with":[28,75],"two":[29,47],"schemas,":[30],"semi-supervised":[32],"algorithm":[34],"would":[35,134],"generate":[36],"pairwise":[37],"table":[38],"and":[39,65,123,160,193],"attribute":[40],"matches.":[41],"Having":[42],"correct":[44],"mapping":[45],"between":[46],"enables":[49],"many":[50],"scenarios,":[53],"such":[54,116],"as":[55],"integration,":[57],"translation,":[59],"evolution,":[61],"mediated/global":[62],"schema,":[63],"reverse-engineering,":[64],"others":[66],"[18],":[67],"[20],":[68],"[23],":[69],"[29],":[70],"[30].For":[71],"Web":[72,219,230],"scale":[73],"datasets":[74],"millions":[76],"tables":[78,220],"from":[79,224],"hundreds":[80],"thousands":[82],"sources,":[84],"example":[86,196],"WEBTABLES":[87],"[11],":[88],"in":[91,103,203,221],"its":[92,100],"classical":[93],"format":[94],"becomes":[95],"computationally":[96],"infeasible":[97],"due":[98],"quadratic":[101],"complexity":[102],"the":[104,139,190,204],"number":[105],"schemas.":[107],"We":[108],"make":[109],"step":[111],"forward,":[112],"by":[113],"noticing,":[114],"that":[115],"brute-force":[117],"no":[120],"longer":[121],"feasible,":[122],"also":[125],"not":[126],"needed":[127],"at":[128],"scale.":[129],"Instead,":[130],"scalable":[132,149],"solution":[133],"be":[135],"match":[137,161],"only":[138],"semantically":[140,162],"relevant":[141,178],"tables,":[142,164],"which":[143],"are":[144],"much":[145],"less":[146],"numerous.WebLens,":[147],"system,":[152],"first,":[153],"trains":[154],"Deep":[155],"Learning":[156],"models":[157],"find":[159],"similar":[163],"then":[165],"derives":[166],"mediated":[167],"these":[170],"subsets":[171],"enable":[173],"uniform":[174],"access":[175],"all":[177,201],"data.":[179],"this":[181],"paper,":[182,205],"we":[183,206],"focus":[184],"on":[185],"high-level":[187],"description":[188],"entire":[191],"process":[192],"give":[194],"an":[195],"query":[198],"processing.":[199],"For":[200],"experiments":[202],"use":[207],"large-scale":[209],"structured":[210],"dataset":[211],"having":[212],"more":[213,225],"than":[214,226],"15":[215],"million":[216],"English":[222],"coming":[223],"248":[227],"thousand":[228],"sources.":[231]},"counts_by_year":[{"year":2022,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
