{"id":"https://openalex.org/W4396617203","doi":"https://doi.org/10.1007/s00799-024-00395-4","title":"Building datasets to support information extraction and structure parsing from electronic theses and dissertations","display_name":"Building datasets to support information extraction and structure parsing from electronic theses and dissertations","publication_year":2024,"publication_date":"2024-05-03","ids":{"openalex":"https://openalex.org/W4396617203","doi":"https://doi.org/10.1007/s00799-024-00395-4"},"language":"en","primary_location":{"id":"doi:10.1007/s00799-024-00395-4","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s00799-024-00395-4","pdf_url":"https://link.springer.com/content/pdf/10.1007/s00799-024-00395-4.pdf","source":{"id":"https://openalex.org/S110615584","display_name":"International Journal on Digital Libraries","issn_l":"1432-1300","issn":["1432-1300","1432-5012"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"International Journal on Digital Libraries","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://link.springer.com/content/pdf/10.1007/s00799-024-00395-4.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5062743697","display_name":"William A. Ingram","orcid":"https://orcid.org/0000-0002-8307-8844"},"institutions":[{"id":"https://openalex.org/I859038795","display_name":"Virginia Tech","ror":"https://ror.org/02smfhw86","country_code":"US","type":"education","lineage":["https://openalex.org/I859038795"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"William A. Ingram","raw_affiliation_strings":["University Libraries, Virginia Tech, Blacksburg, VA, USA"],"raw_orcid":"https://orcid.org/0000-0002-8307-8844","affiliations":[{"raw_affiliation_string":"University Libraries, Virginia Tech, Blacksburg, VA, USA","institution_ids":["https://openalex.org/I859038795"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075242841","display_name":"Jian Wu","orcid":"https://orcid.org/0000-0003-0173-4463"},"institutions":[{"id":"https://openalex.org/I81365321","display_name":"Old Dominion University","ror":"https://ror.org/04zjtrb98","country_code":"US","type":"education","lineage":["https://openalex.org/I81365321"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jian Wu","raw_affiliation_strings":["Department of Computer Science, Old Dominion University, Norfolk, VA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Computer Science, Old Dominion University, Norfolk, VA, USA","institution_ids":["https://openalex.org/I81365321"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080451138","display_name":"Sampanna Yashwant Kahu","orcid":"https://orcid.org/0000-0002-8522-2926"},"institutions":[{"id":"https://openalex.org/I859038795","display_name":"Virginia Tech","ror":"https://ror.org/02smfhw86","country_code":"US","type":"education","lineage":["https://openalex.org/I859038795"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sampanna Yashwant Kahu","raw_affiliation_strings":["Department of Electrical and Computer Engineering, Virginia Tech, Blacksburg, VA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, Virginia Tech, Blacksburg, VA, USA","institution_ids":["https://openalex.org/I859038795"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5096260045","display_name":"Javaid Akbar Manzoor","orcid":null},"institutions":[{"id":"https://openalex.org/I859038795","display_name":"Virginia Tech","ror":"https://ror.org/02smfhw86","country_code":"US","type":"education","lineage":["https://openalex.org/I859038795"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Javaid Akbar Manzoor","raw_affiliation_strings":["Department of Computer Science, Virginia Tech, Blacksburg, VA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Computer Science, Virginia Tech, Blacksburg, VA, USA","institution_ids":["https://openalex.org/I859038795"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085592027","display_name":"Bipasha Banerjee","orcid":"https://orcid.org/0000-0003-4472-1902"},"institutions":[{"id":"https://openalex.org/I859038795","display_name":"Virginia Tech","ror":"https://ror.org/02smfhw86","country_code":"US","type":"education","lineage":["https://openalex.org/I859038795"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Bipasha Banerjee","raw_affiliation_strings":["Department of Computer Science, Virginia Tech, Blacksburg, VA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Computer Science, Virginia Tech, Blacksburg, VA, USA","institution_ids":["https://openalex.org/I859038795"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107024104","display_name":"Aman Ahuja","orcid":"https://orcid.org/0009-0002-8491-0193"},"institutions":[{"id":"https://openalex.org/I859038795","display_name":"Virginia Tech","ror":"https://ror.org/02smfhw86","country_code":"US","type":"education","lineage":["https://openalex.org/I859038795"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Aman Ahuja","raw_affiliation_strings":["Department of Computer Science, Virginia Tech, Blacksburg, VA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Computer Science, Virginia Tech, Blacksburg, VA, USA","institution_ids":["https://openalex.org/I859038795"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072339879","display_name":"Muntabir Hasan Choudhury","orcid":"https://orcid.org/0000-0002-9318-8844"},"institutions":[{"id":"https://openalex.org/I81365321","display_name":"Old Dominion University","ror":"https://ror.org/04zjtrb98","country_code":"US","type":"education","lineage":["https://openalex.org/I81365321"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Muntabir Hasan Choudhury","raw_affiliation_strings":["Department of Computer Science, Old Dominion University, Norfolk, VA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Computer Science, Old Dominion University, Norfolk, VA, USA","institution_ids":["https://openalex.org/I81365321"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054690853","display_name":"Lamia Salsabil","orcid":"https://orcid.org/0000-0002-6162-2896"},"institutions":[{"id":"https://openalex.org/I81365321","display_name":"Old Dominion University","ror":"https://ror.org/04zjtrb98","country_code":"US","type":"education","lineage":["https://openalex.org/I81365321"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Lamia Salsabil","raw_affiliation_strings":["Department of Computer Science, Old Dominion University, Norfolk, VA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Computer Science, Old Dominion University, Norfolk, VA, USA","institution_ids":["https://openalex.org/I81365321"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053407666","display_name":"Winston Shields","orcid":null},"institutions":[{"id":"https://openalex.org/I81365321","display_name":"Old Dominion University","ror":"https://ror.org/04zjtrb98","country_code":"US","type":"education","lineage":["https://openalex.org/I81365321"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Winston Shields","raw_affiliation_strings":["Department of Computer Science, Old Dominion University, Norfolk, VA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Computer Science, Old Dominion University, Norfolk, VA, USA","institution_ids":["https://openalex.org/I81365321"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5049148461","display_name":"Edward A. Fox","orcid":"https://orcid.org/0000-0003-1447-6870"},"institutions":[{"id":"https://openalex.org/I859038795","display_name":"Virginia Tech","ror":"https://ror.org/02smfhw86","country_code":"US","type":"education","lineage":["https://openalex.org/I859038795"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Edward A. Fox","raw_affiliation_strings":["Department of Computer Science, Virginia Tech, Blacksburg, VA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Computer Science, Virginia Tech, Blacksburg, VA, USA","institution_ids":["https://openalex.org/I859038795"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5062743697"],"corresponding_institution_ids":["https://openalex.org/I859038795"],"apc_list":{"value":2290,"currency":"EUR","value_usd":2890},"apc_paid":{"value":2290,"currency":"EUR","value_usd":2890},"fwci":1.3245,"has_fulltext":true,"cited_by_count":4,"citation_normalized_percentile":{"value":0.82778799,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":98},"biblio":{"volume":"25","issue":"2","first_page":"175","last_page":"196"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.9965000152587891,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.9965000152587891,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10538","display_name":"Data Mining Algorithms and Applications","score":0.994700014591217,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9922000169754028,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8721473217010498},{"id":"https://openalex.org/keywords/parsing","display_name":"Parsing","score":0.7915266156196594},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.6357179880142212},{"id":"https://openalex.org/keywords/information-extraction","display_name":"Information extraction","score":0.6172263026237488},{"id":"https://openalex.org/keywords/extraction","display_name":"Extraction (chemistry)","score":0.4736557900905609},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.24234935641288757}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8721473217010498},{"id":"https://openalex.org/C186644900","wikidata":"https://www.wikidata.org/wiki/Q194152","display_name":"Parsing","level":2,"score":0.7915266156196594},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.6357179880142212},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.6172263026237488},{"id":"https://openalex.org/C4725764","wikidata":"https://www.wikidata.org/wiki/Q844704","display_name":"Extraction (chemistry)","level":2,"score":0.4736557900905609},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.24234935641288757},{"id":"https://openalex.org/C43617362","wikidata":"https://www.wikidata.org/wiki/Q170050","display_name":"Chromatography","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1007/s00799-024-00395-4","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s00799-024-00395-4","pdf_url":"https://link.springer.com/content/pdf/10.1007/s00799-024-00395-4.pdf","source":{"id":"https://openalex.org/S110615584","display_name":"International Journal on Digital Libraries","issn_l":"1432-1300","issn":["1432-1300","1432-5012"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"International Journal on Digital Libraries","raw_type":"journal-article"},{"id":"pmh:oai:vtechworks.lib.vt.edu:10919/139737","is_oa":true,"landing_page_url":"https://hdl.handle.net/10919/139737","pdf_url":"https://vtechworks.lib.vt.edu/bitstreams/9231e80d-809d-41f4-ba07-638024872144/download","source":{"id":"https://openalex.org/S4306400248","display_name":"VTechWorks (Virginia Tech)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I859038795","host_organization_name":"Virginia Tech","host_organization_lineage":["https://openalex.org/I859038795"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Text"},{"id":"pmh:oai:digitalcommons.odu.edu:computerscience_fac_pubs-1325","is_oa":true,"landing_page_url":"https://digitalcommons.odu.edu/computerscience_fac_pubs/320","pdf_url":null,"source":{"id":"https://openalex.org/S4377196314","display_name":"ODU Digital Commons (Old Dominion University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I81365321","host_organization_name":"Old Dominion University","host_organization_lineage":["https://openalex.org/I81365321"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Computer Science Faculty Publications","raw_type":"article"}],"best_oa_location":{"id":"doi:10.1007/s00799-024-00395-4","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s00799-024-00395-4","pdf_url":"https://link.springer.com/content/pdf/10.1007/s00799-024-00395-4.pdf","source":{"id":"https://openalex.org/S110615584","display_name":"International Journal on Digital Libraries","issn_l":"1432-1300","issn":["1432-1300","1432-5012"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"International Journal on Digital Libraries","raw_type":"journal-article"},"sustainable_development_goals":[{"score":0.7099999785423279,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[{"id":"https://openalex.org/G1471725967","display_name":null,"funder_award_id":"LG-37-19-0078-19","funder_id":"https://openalex.org/F4320306122","funder_display_name":"Institute of Museum and Library Services"}],"funders":[{"id":"https://openalex.org/F4320306122","display_name":"Institute of Museum and Library Services","ror":"https://ror.org/030prv062"},{"id":"https://openalex.org/F4320310598","display_name":"Amazon Web Services","ror":"https://ror.org/04mv4n011"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4396617203.pdf","grobid_xml":"https://content.openalex.org/works/W4396617203.grobid-xml"},"referenced_works_count":75,"referenced_works":["https://openalex.org/W791527587","https://openalex.org/W1508022514","https://openalex.org/W1525757074","https://openalex.org/W1603719052","https://openalex.org/W1861492603","https://openalex.org/W1991869139","https://openalex.org/W1997754546","https://openalex.org/W2001642682","https://openalex.org/W2008433969","https://openalex.org/W2009752265","https://openalex.org/W2027582596","https://openalex.org/W2031071334","https://openalex.org/W2042014073","https://openalex.org/W2048715559","https://openalex.org/W2070812389","https://openalex.org/W2136583886","https://openalex.org/W2179352600","https://openalex.org/W2250539671","https://openalex.org/W2258370040","https://openalex.org/W2416987009","https://openalex.org/W2777092996","https://openalex.org/W2788907956","https://openalex.org/W2795424778","https://openalex.org/W2803318469","https://openalex.org/W2803654587","https://openalex.org/W2894170875","https://openalex.org/W2919502278","https://openalex.org/W2945260553","https://openalex.org/W2949650786","https://openalex.org/W2962835968","https://openalex.org/W2963026768","https://openalex.org/W2963037989","https://openalex.org/W2963341956","https://openalex.org/W2965373594","https://openalex.org/W2970771982","https://openalex.org/W2972380778","https://openalex.org/W2974458888","https://openalex.org/W2981630388","https://openalex.org/W2990230185","https://openalex.org/W2997154779","https://openalex.org/W3000861907","https://openalex.org/W3003711898","https://openalex.org/W3009242205","https://openalex.org/W3015453090","https://openalex.org/W3032538329","https://openalex.org/W3042011474","https://openalex.org/W3045066314","https://openalex.org/W3088290493","https://openalex.org/W3089136920","https://openalex.org/W3098722327","https://openalex.org/W3101186801","https://openalex.org/W3101577715","https://openalex.org/W3104049173","https://openalex.org/W3104953317","https://openalex.org/W3113753692","https://openalex.org/W3121289818","https://openalex.org/W3169837560","https://openalex.org/W3176028514","https://openalex.org/W3176349801","https://openalex.org/W3196831756","https://openalex.org/W4205746567","https://openalex.org/W4253723135","https://openalex.org/W4256351452","https://openalex.org/W4287782095","https://openalex.org/W4287826062","https://openalex.org/W4290662212","https://openalex.org/W4293261616","https://openalex.org/W4302283059","https://openalex.org/W4393844589","https://openalex.org/W6603074440","https://openalex.org/W6610385870","https://openalex.org/W6739901393","https://openalex.org/W6793659597","https://openalex.org/W6893690505","https://openalex.org/W6927031832"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W2382290278","https://openalex.org/W4395014643","https://openalex.org/W4391913857","https://openalex.org/W2350741829"],"abstract_inverted_index":{"Abstract":[0],"Despite":[1],"the":[2,27,43,112,117],"millions":[3],"of":[4,104],"electronic":[5],"theses":[6],"and":[7,24,41,69,75,87,100,139,158],"dissertations":[8],"(ETDs)":[9],"publicly":[10],"available":[11],"online,":[12],"digital":[13,33,166],"library":[14,34],"services":[15,35],"for":[16,59,73,168],"ETDs":[17],"have":[18,55],"not":[19],"evolved":[20],"past":[21],"simple":[22],"search":[23],"browse":[25],"at":[26],"metadata":[28],"level.":[29],"We":[30,107,127,149],"need":[31],"better":[32],"that":[36,94],"allow":[37],"users":[38],"to":[39,84,98,135,140,154,163],"discover":[40],"explore":[42],"content":[44],"buried":[45],"in":[46,52,92],"these":[47,67],"long":[48],"documents.":[49,106],"Recent":[50],"advances":[51],"machine":[53,89,146,160],"learning":[54,90,147,161],"shown":[56],"promising":[57],"results":[58],"decomposing":[60],"documents":[61],"into":[62],"their":[63],"constituent":[64],"parts,":[65],"but":[66],"models":[68],"techniques":[70,162],"require":[71],"data":[72,118,123],"training":[74],"evaluation.":[76],"In":[77],"this":[78],"article,":[79],"we":[80,110],"present":[81],"high-quality":[82],"datasets":[83,113,131,157],"train,":[85],"evaluate,":[86,141],"compare":[88],"methods":[91],"tasks":[93],"are":[95],"specifically":[96],"suited":[97],"identify":[99],"extract":[101],"key":[102],"elements":[103],"ETD":[105],"explain":[108],"how":[109,129],"construct":[111],"by":[114,120],"manual":[115],"labeling":[116],"or":[119,143],"deriving":[121],"labeled":[122],"through":[124],"synthetic":[125],"processes.":[126],"demonstrate":[128],"our":[130,151],"can":[132],"be":[133],"used":[134],"develop":[136],"downstream":[137],"applications":[138],"retrain,":[142],"fine-tune":[144],"pre-trained":[145],"models.":[148],"describe":[150],"ongoing":[152],"work":[153],"compile":[155],"benchmark":[156],"exploit":[159],"build":[164],"intelligent":[165],"libraries":[167],"ETDs.":[169]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":1}],"updated_date":"2026-03-12T06:13:28.667946","created_date":"2025-10-10T00:00:00"}
