{"id":"https://openalex.org/W4386125402","doi":"https://doi.org/10.14778/3611479.3611511","title":"Self-Training for Label-Efficient Information Extraction from Semi-Structured Web-Pages","display_name":"Self-Training for Label-Efficient Information Extraction from Semi-Structured Web-Pages","publication_year":2023,"publication_date":"2023-07-01","ids":{"openalex":"https://openalex.org/W4386125402","doi":"https://doi.org/10.14778/3611479.3611511"},"language":"en","primary_location":{"id":"doi:10.14778/3611479.3611511","is_oa":false,"landing_page_url":"https://doi.org/10.14778/3611479.3611511","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5014228532","display_name":"Ritesh Sarkhel","orcid":"https://orcid.org/0000-0002-9116-1517"},"institutions":[{"id":"https://openalex.org/I1311688040","display_name":"Amazon (United States)","ror":"https://ror.org/04mv4n011","country_code":"US","type":"company","lineage":["https://openalex.org/I1311688040"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Ritesh Sarkhel","raw_affiliation_strings":["Amazon, Seattle, Washington"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Amazon, Seattle, Washington","institution_ids":["https://openalex.org/I1311688040"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108063115","display_name":"Binxuan Huang","orcid":"https://orcid.org/0000-0003-3571-5738"},"institutions":[{"id":"https://openalex.org/I1311688040","display_name":"Amazon (United States)","ror":"https://ror.org/04mv4n011","country_code":"US","type":"company","lineage":["https://openalex.org/I1311688040"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Binxuan Huang","raw_affiliation_strings":["Amazon, Seattle, Washington"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Amazon, Seattle, Washington","institution_ids":["https://openalex.org/I1311688040"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056881273","display_name":"Colin Lockard","orcid":null},"institutions":[{"id":"https://openalex.org/I1311688040","display_name":"Amazon (United States)","ror":"https://ror.org/04mv4n011","country_code":"US","type":"company","lineage":["https://openalex.org/I1311688040"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Colin Lockard","raw_affiliation_strings":["Amazon, Seattle, Washington"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Amazon, Seattle, Washington","institution_ids":["https://openalex.org/I1311688040"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5004318489","display_name":"Prashant Shiralkar","orcid":null},"institutions":[{"id":"https://openalex.org/I1311688040","display_name":"Amazon (United States)","ror":"https://ror.org/04mv4n011","country_code":"US","type":"company","lineage":["https://openalex.org/I1311688040"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Prashant Shiralkar","raw_affiliation_strings":["Amazon, Seattle, Washington"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Amazon, Seattle, Washington","institution_ids":["https://openalex.org/I1311688040"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5014228532"],"corresponding_institution_ids":["https://openalex.org/I1311688040"],"apc_list":null,"apc_paid":null,"fwci":1.3122,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.84910346,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":96},"biblio":{"volume":"16","issue":"11","first_page":"3098","last_page":"3110"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12479","display_name":"Web Application Security Vulnerabilities","score":0.9750000238418579,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11550","display_name":"Text and Document Classification Technologies","score":0.9739999771118164,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7938897609710693},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.7039506435394287},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.6206103563308716},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.553403913974762},{"id":"https://openalex.org/keywords/information-extraction","display_name":"Information extraction","score":0.5448340177536011},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.53934645652771},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.5221496224403381},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.5050056576728821},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.49772074818611145},{"id":"https://openalex.org/keywords/extraction","display_name":"Extraction (chemistry)","score":0.4823702871799469},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.46415457129478455},{"id":"https://openalex.org/keywords/web-page","display_name":"Web page","score":0.46258923411369324},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.41895923018455505},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.4115004539489746},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.3363206386566162},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.1659795641899109},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.08263731002807617},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.07921311259269714}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7938897609710693},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.7039506435394287},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.6206103563308716},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.553403913974762},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.5448340177536011},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.53934645652771},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.5221496224403381},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.5050056576728821},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.49772074818611145},{"id":"https://openalex.org/C4725764","wikidata":"https://www.wikidata.org/wiki/Q844704","display_name":"Extraction (chemistry)","level":2,"score":0.4823702871799469},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.46415457129478455},{"id":"https://openalex.org/C21959979","wikidata":"https://www.wikidata.org/wiki/Q36774","display_name":"Web page","level":2,"score":0.46258923411369324},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.41895923018455505},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.4115004539489746},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3363206386566162},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.1659795641899109},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.08263731002807617},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.07921311259269714},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C201995342","wikidata":"https://www.wikidata.org/wiki/Q682496","display_name":"Systems engineering","level":1,"score":0.0},{"id":"https://openalex.org/C43617362","wikidata":"https://www.wikidata.org/wiki/Q170050","display_name":"Chromatography","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.14778/3611479.3611511","is_oa":false,"landing_page_url":"https://doi.org/10.14778/3611479.3611511","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W1490688894","https://openalex.org/W2024091454","https://openalex.org/W2097874932","https://openalex.org/W2111316763","https://openalex.org/W2134150392","https://openalex.org/W2141456009","https://openalex.org/W2143309843","https://openalex.org/W2161861392","https://openalex.org/W2163072729","https://openalex.org/W2165698076","https://openalex.org/W2167460663","https://openalex.org/W2340809461","https://openalex.org/W2604259521","https://openalex.org/W2792180729","https://openalex.org/W2912351665","https://openalex.org/W2947298857","https://openalex.org/W2948784110","https://openalex.org/W2971071159","https://openalex.org/W3034300118","https://openalex.org/W3081176230","https://openalex.org/W3121976951","https://openalex.org/W3165374601","https://openalex.org/W4284702754","https://openalex.org/W4385573087"],"related_works":["https://openalex.org/W4365211920","https://openalex.org/W3014948380","https://openalex.org/W4380551139","https://openalex.org/W2280377497","https://openalex.org/W4387506531","https://openalex.org/W3174044702","https://openalex.org/W4238433571","https://openalex.org/W2967848559","https://openalex.org/W4283803360","https://openalex.org/W4317695495"],"abstract_inverted_index":{"Information":[0],"Extraction":[1,39],"(IE)":[2],"from":[3,192],"semi-structured":[4],"web-pages":[5],"is":[6],"a":[7,12,19,86,102,123,131],"long":[8],"studied":[9],"problem.":[10],"Training":[11],"model":[13,125],"for":[14,56,78,207],"this":[15,36,82,98,112],"extraction":[16,83,144],"task":[17],"requires":[18,94],"large":[20,57],"number":[21,201],"of":[22,35,41,49,111,130,202],"human-labeled":[23,76,91,190,203],"samples.":[24],"Prior":[25],"works":[26],"have":[27],"proposed":[28],"transferable":[29,42],"models":[30,43,59,67,185],"to":[31,71,107,140,150,174,212],"improve":[32,108],"the":[33,47,109,128,143,200],"label-efficiency":[34,110],"training":[37,152,162,204],"process.":[38,114],"performance":[40,80,145,209],"however,":[44,93],"depends":[45],"on":[46,74,81,165],"size":[48],"their":[50],"fine-tuning":[51,88,113,133,156],"corpus.":[52],"This":[53],"holds":[54],"true":[55],"language":[58],"(LLM)":[60],"such":[61],"as":[62,64],"GPT-3":[63],"well.":[65],"Generalist":[66],"like":[68],"LLMs":[69],"need":[70],"be":[72],"fine-tuned":[73],"in-domain,":[75],"samples":[77,153,205],"competitive":[79],"task.":[84],"Constructing":[85],"large-scale":[87,132],"corpus":[89,134],"with":[90,135,186],"samples,":[92],"significant":[95],"effort.":[96],"In":[97],"paper,":[99],"we":[100,121,158,182],"develop":[101,122,159],"Label-Efficient":[103],"Self-Training":[104],"Algorithm":[105],"(LEAST)":[106],"Our":[115],"contributions":[116],"are":[117],"two-fold.":[118],"First":[119],",":[120,139],"generative":[124],"that":[126,142,171],"facilitates":[127],"construction":[129],"minimal":[136],"human-effort.":[137],"Second":[138],"ensure":[141],"does":[146],"not":[147],"suffer":[148],"due":[149],"noisy":[151],"in":[154],"our":[155],"corpus,":[157],"an":[160],"uncertainty-aware":[161],"strategy.":[163],"Experiments":[164],"two":[166],"publicly":[167],"available":[168],"datasets":[169],"show":[170],"LEAST":[172],"generalizes":[173],"multiple":[175],"verticals":[176],"and":[177],"backbone":[178],"models.":[179],"Using":[180],"LEAST,":[181],"can":[183],"train":[184],"less":[187],"than":[188],"ten":[189],"pages":[191],"each":[193],"website,":[194],"outperforming":[195],"strong":[196],"baselines":[197],"while":[198],"reducing":[199],"needed":[206],"comparable":[208],"by":[210],"up":[211],"11":[213],"x.":[214]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":1}],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2025-10-10T00:00:00"}
