{"id":"https://openalex.org/W1481617452","doi":"https://doi.org/10.1108/prog-12-2011-0059","title":"Extracting bibliographical data for PDF documents with HMM and external resources","display_name":"Extracting bibliographical data for PDF documents with HMM and external resources","publication_year":2014,"publication_date":"2014-07-01","ids":{"openalex":"https://openalex.org/W1481617452","doi":"https://doi.org/10.1108/prog-12-2011-0059","mag":"1481617452"},"language":"en","primary_location":{"id":"doi:10.1108/prog-12-2011-0059","is_oa":false,"landing_page_url":"https://doi.org/10.1108/prog-12-2011-0059","pdf_url":null,"source":{"id":"https://openalex.org/S152623109","display_name":"Program electronic library and information systems","issn_l":"0033-0337","issn":["0033-0337","1758-7301"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319811","host_organization_name":"Emerald Publishing Limited","host_organization_lineage":["https://openalex.org/P4310319811"],"host_organization_lineage_names":["Emerald Publishing Limited"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Program","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5020333990","display_name":"Wen-Feng Hsiao","orcid":null},"institutions":[{"id":"https://openalex.org/I1309796872","display_name":"National Pingtung University","ror":"https://ror.org/03z698x91","country_code":"TW","type":"education","lineage":["https://openalex.org/I1309796872"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Wen-Feng Hsiao","raw_affiliation_strings":["Department of Information Management, National Pingtung Institute of Commerce, Pingtung, Taiwan","(Department of Information Management, National Pingtung Institute of Commerce, Pingtung, Taiwan)"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Information Management, National Pingtung Institute of Commerce, Pingtung, Taiwan","institution_ids":["https://openalex.org/I1309796872"]},{"raw_affiliation_string":"(Department of Information Management, National Pingtung Institute of Commerce, Pingtung, Taiwan)","institution_ids":["https://openalex.org/I1309796872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103056510","display_name":"Te-Min Chang","orcid":"https://orcid.org/0000-0002-7373-8175"},"institutions":[{"id":"https://openalex.org/I142974352","display_name":"National Sun Yat-sen University","ror":"https://ror.org/00mjawt10","country_code":"TW","type":"education","lineage":["https://openalex.org/I142974352"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Te-Min Chang","raw_affiliation_strings":["Department of Information Management, National Sun Yat-sen University, Kaohsiung, Taiwan","[Dept. of Information Management, National Sun Yat-Sen University, Kaohsiung, Taiwan]"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Information Management, National Sun Yat-sen University, Kaohsiung, Taiwan","institution_ids":["https://openalex.org/I142974352"]},{"raw_affiliation_string":"[Dept. of Information Management, National Sun Yat-Sen University, Kaohsiung, Taiwan]","institution_ids":["https://openalex.org/I142974352"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5017049074","display_name":"Erwin Thomas","orcid":"https://orcid.org/0000-0002-5004-0292"},"institutions":[{"id":"https://openalex.org/I1309796872","display_name":"National Pingtung University","ror":"https://ror.org/03z698x91","country_code":"TW","type":"education","lineage":["https://openalex.org/I1309796872"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Erwin Thomas","raw_affiliation_strings":["Department of Information Management, National Pingtung Institute of Commerce, Pingtung, Taiwan","(Department of Information Management, National Pingtung Institute of Commerce, Pingtung, Taiwan)"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Information Management, National Pingtung Institute of Commerce, Pingtung, Taiwan","institution_ids":["https://openalex.org/I1309796872"]},{"raw_affiliation_string":"(Department of Information Management, National Pingtung Institute of Commerce, Pingtung, Taiwan)","institution_ids":["https://openalex.org/I1309796872"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.076829,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":98},"biblio":{"volume":"48","issue":"3","first_page":"293","last_page":"313"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13083","display_name":"Advanced Text Analysis Techniques","score":0.9969000220298767,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.996399998664856,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8534585237503052},{"id":"https://openalex.org/keywords/hidden-markov-model","display_name":"Hidden Markov model","score":0.7965942621231079},{"id":"https://openalex.org/keywords/metadata","display_name":"Metadata","score":0.7831223607063293},{"id":"https://openalex.org/keywords/header","display_name":"Header","score":0.7449796795845032},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.5797719955444336},{"id":"https://openalex.org/keywords/digital-library","display_name":"Digital library","score":0.5341842174530029},{"id":"https://openalex.org/keywords/state","display_name":"State (computer science)","score":0.49212783575057983},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.49060121178627014},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3577309846878052},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.16249564290046692},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.07448279857635498}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8534585237503052},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.7965942621231079},{"id":"https://openalex.org/C93518851","wikidata":"https://www.wikidata.org/wiki/Q180160","display_name":"Metadata","level":2,"score":0.7831223607063293},{"id":"https://openalex.org/C48105269","wikidata":"https://www.wikidata.org/wiki/Q1141160","display_name":"Header","level":2,"score":0.7449796795845032},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.5797719955444336},{"id":"https://openalex.org/C513874922","wikidata":"https://www.wikidata.org/wiki/Q212805","display_name":"Digital library","level":3,"score":0.5341842174530029},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.49212783575057983},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.49060121178627014},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3577309846878052},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.16249564290046692},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.07448279857635498},{"id":"https://openalex.org/C124952713","wikidata":"https://www.wikidata.org/wiki/Q8242","display_name":"Literature","level":1,"score":0.0},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0},{"id":"https://openalex.org/C164913051","wikidata":"https://www.wikidata.org/wiki/Q482","display_name":"Poetry","level":2,"score":0.0},{"id":"https://openalex.org/C142362112","wikidata":"https://www.wikidata.org/wiki/Q735","display_name":"Art","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1108/prog-12-2011-0059","is_oa":false,"landing_page_url":"https://doi.org/10.1108/prog-12-2011-0059","pdf_url":null,"source":{"id":"https://openalex.org/S152623109","display_name":"Program electronic library and information systems","issn_l":"0033-0337","issn":["0033-0337","1758-7301"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319811","host_organization_name":"Emerald Publishing Limited","host_organization_lineage":["https://openalex.org/P4310319811"],"host_organization_lineage_names":["Emerald Publishing Limited"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Program","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":23,"referenced_works":["https://openalex.org/W1488777300","https://openalex.org/W1534730506","https://openalex.org/W1984130280","https://openalex.org/W1998839545","https://openalex.org/W1999595522","https://openalex.org/W2023160357","https://openalex.org/W2072326204","https://openalex.org/W2079999708","https://openalex.org/W2098162425","https://openalex.org/W2098564132","https://openalex.org/W2118020653","https://openalex.org/W2133004463","https://openalex.org/W2140479099","https://openalex.org/W2142362549","https://openalex.org/W2150644272","https://openalex.org/W2158789913","https://openalex.org/W2162630660","https://openalex.org/W2163737492","https://openalex.org/W2163918411","https://openalex.org/W2168859760","https://openalex.org/W2171114275","https://openalex.org/W4253723135","https://openalex.org/W6649959243"],"related_works":["https://openalex.org/W2171597999","https://openalex.org/W2189136227","https://openalex.org/W1866537546","https://openalex.org/W630850086","https://openalex.org/W3200508093","https://openalex.org/W4372053344","https://openalex.org/W3193978431","https://openalex.org/W2519240373","https://openalex.org/W2379752180","https://openalex.org/W1496096987"],"abstract_inverted_index":{"Purpose":[0],"\u2013":[1,31,94,269,345,380],"The":[2,32,107,124,149,199,248,258,323,381,398,425,466],"purpose":[3],"of":[4,91,103,160,186,193,238,260,319,414,476],"this":[5,454,481],"paper":[6,174,182,229],"is":[7,140,264,275,303,310,384,388,390,401,427,455],"to":[8,17,36,46,55,76,87,99,143,241,316,339,363,423,445],"propose":[9],"an":[10,50],"automatic":[11],"metadata":[12,318],"extraction":[13,216],"and":[14,39,49,59,65,84,117,217,305,312,405,408,447],"retrieval":[15],"system":[16,188,205,244,253,274,282,302,309,350,426],"extract":[18,37,56],"bibliographical":[19,214,325],"information":[20],"from":[21,403,458],"digital":[22,77],"academic":[23,271],"documents":[24],"in":[25,277,367,480],"portable":[26],"document":[27],"formats":[28],"(PDFs).":[29],"Design/methodology/approach":[30],"authors":[33,66,410,440],"use":[34,442],"PDFBox":[35],"text":[38],"font":[40],"size":[41],"information,":[42],"a":[43,133,190,235],"rule-based":[44],"method":[45,479],"identify":[47],"titles,":[48],"Hidden":[51],"Markov":[52],"Model":[53],"(HMM)":[54],"the":[57,62,89,101,104,158,178,181,184,220,228,231,261,273,281,301,308,349,353,409,415,433,437,474],"titles":[58,64],"authors.":[60],"Finally,":[61,227],"extracted":[63,324],"(possibly":[67],"incorrect":[68],"or":[69,296,337],"incomplete)":[70],"are":[71,97],"sent":[72],"as":[73,165,168,335,453],"query":[74],"strings":[75],"libraries":[78],"(e.g.":[79],"ACM,":[80],"IEEE,":[81],"CiteSeerX,":[82],"SDOS,":[83],"Google":[85],"Scholar)":[86],"retrieve":[88],"rest":[90],"metadata.":[92],"Findings":[93],"Four":[95],"experiments":[96],"conducted":[98],"examine":[100],"feasibility":[102,259,475],"proposed":[105,122,155,204,262,478],"system.":[106],"first":[108],"experiment":[109,151,180,233],"compares":[110],"two":[111,278,395],"different":[112,457],"HMM":[113,289,377,382,396,417],"models:":[114],"multi-state":[115,137],"model":[116,120,130,156,171,263,400,438],"one":[118,128],"state":[119,129],"(the":[121,439],"model).":[123],"result":[125,200,249],"shows":[126,152,201,250],"that":[127,153,202,251,391],"can":[131,163,209,313,327,351],"have":[132,462,469],"comparable":[134],"performance":[135,167,185],"with":[136,145,234,245],"model,":[138],"but":[139],"more":[141],"suitable":[142],"deal":[144],"real-world":[146,320,450],"unknown":[147],"states.":[148],"second":[150],"our":[154,187,203,243,252,477],"(without":[157],"aid":[159],"online":[161,207],"query)":[162,208],"achieve":[164],"good":[166,361],"other":[169,293,460],"researcher's":[170],"on":[172,189,213,449],"Cora":[173,285],"header":[175,286],"dataset.":[176],"In":[177],"third":[179],"examines":[183],"small":[191],"dataset":[192,237,444],"43":[194],"real":[195,368],"PDF":[196,321,451],"research":[197],"papers.":[198],"(with":[206],"perform":[210],"pretty":[211],"well":[212],"data":[215,326],"even":[218,429],"outperform":[219,352],"free":[221],"citation":[222,332],"management":[223],"tool":[224],"Zotero":[225,246,256,356],"3.0.":[226],"conducts":[230],"fourth":[232],"larger":[236],"103":[239],"papers":[240],"compare":[242],"4.0.":[247,257],"significantly":[254,456],"outperforms":[255],"thus":[265],"justified.":[266],"Research":[267],"limitations/implications":[268],"For":[270,346],"implication,":[272,348],"unique":[276],"folds:":[279],"first,":[280],"only":[283],"uses":[284],"set":[287],"for":[288],"training,":[290],"without":[291,430],"using":[292],"tagged":[294],"datasets":[295,434],"gazetteers":[297],"resources,":[298],"which":[299],"means":[300],"light":[304],"scalable.":[306],"Second,":[307],"workable":[311,428],"be":[314,329],"applied":[315],"extracting":[317],"files.":[322],"then":[328],"imported":[330],"into":[331],"software":[333],"such":[334],"endnote":[336],"refworks":[338],"increase":[340],"researchers\u2019":[341],"productivity.":[342],"Practical":[343],"implications":[344],"practical":[347],"existing":[354],"tool,":[355],"v4.0.":[357],"This":[358],"provides":[359],"practitioners":[360],"chances":[362],"develop":[364],"similar":[365],"products":[366],"applications;":[369],"though":[370],"it":[371,392],"might":[372],"require":[373],"some":[374],"knowledge":[375],"about":[376,473],"implementation.":[378],"Originality/value":[379],"implementation":[383],"not":[385],"novel.":[386],"What":[387],"innovative":[389],"actually":[393],"combines":[394],"models.":[397],"main":[399],"adapted":[402],"Freitag":[404],"Mccallum":[406],"(1999)":[407],"add":[411],"word":[412],"features":[413],"Nymble":[416],"(Bikel":[418],"et":[419],"al":[420],",":[421],"1997)":[422],"it.":[424],"manually":[431],"tagging":[432],"before":[435],"training":[436],"just":[441],"cora":[443],"train":[446],"test":[448],"papers),":[452],"what":[459],"works":[461],"done":[463],"so":[464],"far.":[465],"experimental":[467],"results":[468],"shown":[470],"sufficient":[471],"evidence":[472],"aspect.":[482]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2023,"cited_by_count":1},{"year":2019,"cited_by_count":1},{"year":2018,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
