{"id":"https://openalex.org/W4413980055","doi":"https://doi.org/10.14778/3749646.3749647","title":"Doctopus: Budget-Aware Structural Table Extraction from Unstructured Documents","display_name":"Doctopus: Budget-Aware Structural Table Extraction from Unstructured Documents","publication_year":2025,"publication_date":"2025-07-01","ids":{"openalex":"https://openalex.org/W4413980055","doi":"https://doi.org/10.14778/3749646.3749647"},"language":"en","primary_location":{"id":"doi:10.14778/3749646.3749647","is_oa":false,"landing_page_url":"https://doi.org/10.14778/3749646.3749647","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5054869135","display_name":"Chengliang Chai","orcid":"https://orcid.org/0009-0003-5386-1330"},"institutions":[{"id":"https://openalex.org/I125839683","display_name":"Beijing Institute of Technology","ror":"https://ror.org/01skt4w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I125839683","https://openalex.org/I890469752"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Chengliang Chai","raw_affiliation_strings":["Beijing Institute of Technology, China"],"affiliations":[{"raw_affiliation_string":"Beijing Institute of Technology, China","institution_ids":["https://openalex.org/I125839683"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100352041","display_name":"Jiajun Li","orcid":"https://orcid.org/0000-0002-7208-9345"},"institutions":[{"id":"https://openalex.org/I125839683","display_name":"Beijing Institute of Technology","ror":"https://ror.org/01skt4w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I125839683","https://openalex.org/I890469752"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiajun Li","raw_affiliation_strings":["Beijing Institute of Technology, China"],"affiliations":[{"raw_affiliation_string":"Beijing Institute of Technology, China","institution_ids":["https://openalex.org/I125839683"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5044971195","display_name":"Yuhao Deng","orcid":"https://orcid.org/0009-0002-4473-4527"},"institutions":[{"id":"https://openalex.org/I125839683","display_name":"Beijing Institute of Technology","ror":"https://ror.org/01skt4w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I125839683","https://openalex.org/I890469752"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuhao Deng","raw_affiliation_strings":["Beijing Institute of Technology, China"],"affiliations":[{"raw_affiliation_string":"Beijing Institute of Technology, China","institution_ids":["https://openalex.org/I125839683"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074946289","display_name":"Yang Zhong","orcid":"https://orcid.org/0009-0004-6933-8736"},"institutions":[{"id":"https://openalex.org/I125839683","display_name":"Beijing Institute of Technology","ror":"https://ror.org/01skt4w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I125839683","https://openalex.org/I890469752"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuanhao Zhong","raw_affiliation_strings":["Beijing Institute of Technology, China"],"affiliations":[{"raw_affiliation_string":"Beijing Institute of Technology, China","institution_ids":["https://openalex.org/I125839683"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014346487","display_name":"Ye Yuan","orcid":"https://orcid.org/0000-0002-0247-9866"},"institutions":[{"id":"https://openalex.org/I125839683","display_name":"Beijing Institute of Technology","ror":"https://ror.org/01skt4w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I125839683","https://openalex.org/I890469752"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ye Yuan","raw_affiliation_strings":["Beijing Institute of Technology, China"],"affiliations":[{"raw_affiliation_string":"Beijing Institute of Technology, China","institution_ids":["https://openalex.org/I125839683"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103052244","display_name":"Guoren Wang","orcid":"https://orcid.org/0000-0002-8411-2127"},"institutions":[{"id":"https://openalex.org/I125839683","display_name":"Beijing Institute of Technology","ror":"https://ror.org/01skt4w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I125839683","https://openalex.org/I890469752"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guoren Wang","raw_affiliation_strings":["Beijing Institute of Technology, China"],"affiliations":[{"raw_affiliation_string":"Beijing Institute of Technology, China","institution_ids":["https://openalex.org/I125839683"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5049926126","display_name":"Lei Cao","orcid":"https://orcid.org/0000-0001-9909-8607"},"institutions":[{"id":"https://openalex.org/I138006243","display_name":"University of Arizona","ror":"https://ror.org/03m2x1q45","country_code":"US","type":"education","lineage":["https://openalex.org/I138006243"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Lei Cao","raw_affiliation_strings":["University of Arizona, United States"],"affiliations":[{"raw_affiliation_string":"University of Arizona, United States","institution_ids":["https://openalex.org/I138006243"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5054869135"],"corresponding_institution_ids":["https://openalex.org/I125839683"],"apc_list":null,"apc_paid":null,"fwci":1.3088,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.84349926,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":"18","issue":"11","first_page":"3695","last_page":"3707"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9961000084877014,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9961000084877014,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12377","display_name":"Digital Humanities and Scholarship","score":0.9916999936103821,"subfield":{"id":"https://openalex.org/subfields/1208","display_name":"Literature and Literary Theory"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.98580002784729,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6680599451065063},{"id":"https://openalex.org/keywords/table","display_name":"Table (database)","score":0.612919807434082},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.5797837972640991},{"id":"https://openalex.org/keywords/table-of-contents","display_name":"Table of contents","score":0.5614120364189148},{"id":"https://openalex.org/keywords/unstructured-data","display_name":"Unstructured data","score":0.44761449098587036},{"id":"https://openalex.org/keywords/extraction","display_name":"Extraction (chemistry)","score":0.4447920322418213},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.32285159826278687},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.3118804693222046},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.2604403495788574},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.24964389204978943},{"id":"https://openalex.org/keywords/chemistry","display_name":"Chemistry","score":0.06575620174407959},{"id":"https://openalex.org/keywords/big-data","display_name":"Big data","score":0.05782094597816467},{"id":"https://openalex.org/keywords/chromatography","display_name":"Chromatography","score":0.04768219590187073}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6680599451065063},{"id":"https://openalex.org/C45235069","wikidata":"https://www.wikidata.org/wiki/Q278425","display_name":"Table (database)","level":2,"score":0.612919807434082},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.5797837972640991},{"id":"https://openalex.org/C68476402","wikidata":"https://www.wikidata.org/wiki/Q1456936","display_name":"Table of contents","level":2,"score":0.5614120364189148},{"id":"https://openalex.org/C2781252014","wikidata":"https://www.wikidata.org/wiki/Q1141900","display_name":"Unstructured data","level":3,"score":0.44761449098587036},{"id":"https://openalex.org/C4725764","wikidata":"https://www.wikidata.org/wiki/Q844704","display_name":"Extraction (chemistry)","level":2,"score":0.4447920322418213},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.32285159826278687},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.3118804693222046},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.2604403495788574},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.24964389204978943},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.06575620174407959},{"id":"https://openalex.org/C75684735","wikidata":"https://www.wikidata.org/wiki/Q858810","display_name":"Big data","level":2,"score":0.05782094597816467},{"id":"https://openalex.org/C43617362","wikidata":"https://www.wikidata.org/wiki/Q170050","display_name":"Chromatography","level":1,"score":0.04768219590187073}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.14778/3749646.3749647","is_oa":false,"landing_page_url":"https://doi.org/10.14778/3749646.3749647","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":27,"referenced_works":["https://openalex.org/W1498990157","https://openalex.org/W2077815765","https://openalex.org/W2094625154","https://openalex.org/W2483327705","https://openalex.org/W2740554209","https://openalex.org/W2805173585","https://openalex.org/W3014705052","https://openalex.org/W3103433205","https://openalex.org/W3123375411","https://openalex.org/W3148437589","https://openalex.org/W3198523333","https://openalex.org/W3213739863","https://openalex.org/W4301243929","https://openalex.org/W4310923309","https://openalex.org/W4376122773","https://openalex.org/W4385571189","https://openalex.org/W4387427491","https://openalex.org/W4389518671","https://openalex.org/W4389519207","https://openalex.org/W4389520355","https://openalex.org/W4389539730","https://openalex.org/W4391418506","https://openalex.org/W4398859518","https://openalex.org/W4399174722","https://openalex.org/W4401857375","https://openalex.org/W4403754277","https://openalex.org/W4411630013"],"related_works":["https://openalex.org/W3203889067","https://openalex.org/W3184725726","https://openalex.org/W2378793138","https://openalex.org/W2759357633","https://openalex.org/W4394360958","https://openalex.org/W2948670949","https://openalex.org/W4288047943","https://openalex.org/W2591076968","https://openalex.org/W2761062406","https://openalex.org/W4256318226"],"abstract_inverted_index":{"To":[0,96],"fulfill":[1],"the":[2,134,151,158,169,177,208,219,224],"potential":[3],"great":[4],"value":[5],"of":[6,160],"unstructured":[7,114],"documents,":[8],"it":[9,155],"is":[10],"critical":[11],"to":[12,51,75,128,140],"extract":[13],"structural":[14],"data":[15],"(e.g.,":[16],"attributes)":[17],"from":[18,113],"them,":[19],"which":[20],"can":[21,40,217],"benefit":[22],"various":[23,194],"applications":[24],"such":[25,34],"as":[26,35],"analytical":[27],"SQL":[28],"queries":[29],"and":[30,101,143,171,196],"decision-making.":[31],"Multiple":[32],"strategies,":[33],"pre-trained":[36],"language":[37,70],"models":[38,71],"(PLMs),":[39],"be":[41,76],"employed":[42],"for":[43,92,109,163],"this":[44],"task.":[45],"However,":[46],"these":[47],"methods":[48],"often":[49],"struggle":[50],"achieve":[52,129],"high-quality":[53],"results,":[54],"particularly":[55],"when":[56],"dealing":[57],"with":[58,116,125,193,213],"attribute":[59,111],"extraction":[60,112],"that":[61,211],"requires":[62],"intricate":[63],"reasoning":[64],"or":[65],"semantic":[66],"comprehension.":[67],"Recently,":[68],"large":[69],"(LLMs)":[72],"have":[73,184],"proven":[74],"effective":[77],"in":[78],"extracting":[79],"attributes":[80],"but":[81],"incur":[82],"substantial":[83],"costs":[84],"caused":[85],"by":[86,221],"token":[87],"consumption,":[88],"making":[89],"them":[90],"impractical":[91],"large-scale":[93],"document":[94,191],"set.":[95],"best":[97],"trade":[98],"off":[99],"quality":[100,159,220],"cost,":[102],"we":[103],"present":[104],"Doctopus,":[105],"a":[106,117,130,186],"system":[107,135],"designed":[108],"accurate":[110],"documents":[115],"user-specified":[118],"cost":[119,170,226],"constraint.":[120,227],"Overall,":[121],"Doctopus":[122,174,216],"combines":[123],"LLMs":[124],"non-LLM":[126],"strategies":[127,162,179],"good":[131],"tradeoff.":[132],"First,":[133],"employs":[136],"an":[137],"index-based":[138],"approach":[139],"efficiently":[141],"identify":[142],"process":[144],"only":[145],"relevant":[146],"text":[147],"chunks,":[148],"thereby":[149],"reducing":[150],"LLM":[152],"cost.":[153],"Afterwards,":[154],"further":[156],"estimates":[157],"multiple":[161],"each":[164],"attribute.":[165],"Finally,":[166],"based":[167],"on":[168,207],"estimated":[172],"quality,":[173],"dynamically":[175],"selects":[176],"optimal":[178],"through":[180],"budget-aware":[181],"optimization.":[182],"We":[183],"built":[185],"comprehensive":[187],"benchmark":[188,209],"including":[189],"4":[190],"sets":[192],"characteristics":[195],"manually":[197],"labeled":[198],"ground":[199],"truth":[200],"using":[201],"1000":[202],"human":[203],"hours.":[204],"Extensive":[205],"experiments":[206],"show":[210],"compared":[212],"state-of-the-art":[214],"baselines,":[215],"improve":[218],"11%":[222],"given":[223],"same":[225]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-03-25T14:56:36.534964","created_date":"2025-10-10T00:00:00"}
