{"id":"https://openalex.org/W7114894080","doi":"https://doi.org/10.48550/arxiv.2512.09874","title":"Benchmarking Document Parsers on Mathematical Formula Extraction from PDFs","display_name":"Benchmarking Document Parsers on Mathematical Formula Extraction from PDFs","publication_year":2025,"publication_date":"2025-12-10","ids":{"openalex":"https://openalex.org/W7114894080","doi":"https://doi.org/10.48550/arxiv.2512.09874"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2512.09874","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2512.09874","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2512.09874","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Horn, Pius","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Horn, Pius","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Keuper, Janis","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Keuper, Janis","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13523","display_name":"Mathematics, Computing, and Information Processing","score":0.9697999954223633,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13523","display_name":"Mathematics, Computing, and Information Processing","score":0.9697999954223633,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.019899999722838402,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.0010999999940395355,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.8547999858856201},{"id":"https://openalex.org/keywords/parsing","display_name":"Parsing","score":0.8378999829292297},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.6880999803543091},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6186000108718872},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.5299999713897705},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.5069000124931335},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.4235000014305115},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4074000120162964}],"concepts":[{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.8547999858856201},{"id":"https://openalex.org/C186644900","wikidata":"https://www.wikidata.org/wiki/Q194152","display_name":"Parsing","level":2,"score":0.8378999829292297},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8098000288009644},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.6880999803543091},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6186000108718872},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.5299999713897705},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5099999904632568},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.5069000124931335},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4634000062942505},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.4235000014305115},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4074000120162964},{"id":"https://openalex.org/C170858558","wikidata":"https://www.wikidata.org/wiki/Q1394144","display_name":"Automatic summarization","level":2,"score":0.3612000048160553},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3578000068664551},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.35420000553131104},{"id":"https://openalex.org/C2778012447","wikidata":"https://www.wikidata.org/wiki/Q1034415","display_name":"Scope (computer science)","level":2,"score":0.35100001096725464},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.34779998660087585},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.33329999446868896},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.32600000500679016},{"id":"https://openalex.org/C2780009758","wikidata":"https://www.wikidata.org/wiki/Q6804172","display_name":"Measure (data warehouse)","level":2,"score":0.323199987411499},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2978000044822693},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.2856999933719635},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.28380000591278076},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.27950000762939453},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.2696000039577484}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2512.09874","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2512.09874","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2512.09874","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2512.09874","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.735675573348999,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Correctly":[0],"parsing":[1],"mathematical":[2],"formulas":[3,26,135],"from":[4,18,90],"PDFs":[5,43],"is":[6,62],"critical":[7],"for":[8,65,145,149],"training":[9],"large":[10],"language":[11],"models":[12],"and":[13,55,111,126,152,168],"building":[14],"scientific":[15],"knowledge":[16],"bases":[17],"academic":[19],"literature,":[20],"yet":[21],"existing":[22],"benchmarks":[23],"either":[24],"exclude":[25],"entirely":[27],"or":[28],"lack":[29],"semantically-aware":[30],"evaluation":[31,97,161],"metrics.":[32],"We":[33],"introduce":[34],"a":[35,71,154],"novel":[36],"benchmarking":[37],"framework":[38],"centered":[39],"on":[40,84],"synthetically":[41],"generated":[42],"with":[44,70,102,133],"precise":[45],"LaTeX":[46],"ground":[47],"truth,":[48],"enabling":[49],"systematic":[50],"control":[51],"over":[52],"layout,":[53],"formulas,":[54],"content":[56],"characteristics.":[57],"A":[58],"key":[59],"methodological":[60],"contribution":[61],"pioneering":[63],"LLM-as-a-judge":[64],"semantic":[66],"formula":[67,86,164],"assessment,":[68],"combined":[69],"robust":[72],"two-stage":[73],"matching":[74],"pipeline":[75],"that":[76,95,158],"handles":[77],"parser":[78],"output":[79],"inconsistencies.":[80],"Through":[81],"human":[82,103],"validation":[83],"250":[85],"pairs":[87],"(750":[88],"ratings":[89],"30":[91],"evaluators),":[92],"we":[93],"demonstrate":[94],"LLM-based":[96],"achieves":[98],"substantially":[99],"higher":[100],"correlation":[101],"judgment":[104],"(Pearson":[105],"r=0.78)":[106],"compared":[107],"to":[108],"CDM":[109],"(r=0.34)":[110],"text":[112],"similarity":[113],"(r~0).":[114],"Evaluating":[115],"20+":[116],"contemporary":[117],"PDF":[118,163],"parsers":[119,148],"(including":[120],"specialized":[121],"OCR":[122],"models,":[123,125],"vision-language":[124],"rule-based":[127],"approaches)":[128],"across":[129],"100":[130],"synthetic":[131],"documents":[132],"2,000+":[134],"reveals":[136],"significant":[137],"performance":[138],"disparities.":[139],"Our":[140],"findings":[141],"provide":[142],"crucial":[143],"insights":[144],"practitioners":[146],"selecting":[147],"downstream":[150],"applications":[151],"establish":[153],"robust,":[155],"scalable":[156],"methodology":[157],"enables":[159],"reproducible":[160],"of":[162],"extraction":[165],"quality.":[166],"Code":[167],"benchmark":[169],"data:":[170],"https://github.com/phorn1/pdf-parse-bench":[171]},"counts_by_year":[],"updated_date":"2025-12-12T23:20:42.204495","created_date":"2025-12-12T00:00:00"}
