{"id":"https://openalex.org/W7154302418","doi":"https://doi.org/10.48550/arxiv.2604.11042","title":"Improving Layout Representation Learning Across Inconsistently Annotated Datasets via Agentic Harmonization","display_name":"Improving Layout Representation Learning Across Inconsistently Annotated Datasets via Agentic Harmonization","publication_year":2026,"publication_date":"2026-04-13","ids":{"openalex":"https://openalex.org/W7154302418","doi":"https://doi.org/10.48550/arxiv.2604.11042"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.11042","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11042","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.11042","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133557917","display_name":"Renyu Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Li, Renyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133589650","display_name":"Vladimir Kirilenko","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kirilenko, Vladimir","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133580553","display_name":"Yao You","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"You, Yao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133612456","display_name":"Crag Wolfe","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wolfe, Crag","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5133557917"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.29499998688697815,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.29499998688697815,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.2538999915122986,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.13910000026226044,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/annotation","display_name":"Annotation","score":0.6535000205039978},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5902000069618225},{"id":"https://openalex.org/keywords/granularity","display_name":"Granularity","score":0.5202000141143799},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.4855000078678131},{"id":"https://openalex.org/keywords/bounding-overwatch","display_name":"Bounding overwatch","score":0.4794999957084656},{"id":"https://openalex.org/keywords/minimum-bounding-box","display_name":"Minimum bounding box","score":0.46650001406669617},{"id":"https://openalex.org/keywords/table","display_name":"Table (database)","score":0.4535999894142151},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.4472000002861023},{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.429500013589859}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7549999952316284},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.6535000205039978},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5902000069618225},{"id":"https://openalex.org/C177774035","wikidata":"https://www.wikidata.org/wiki/Q1246948","display_name":"Granularity","level":2,"score":0.5202000141143799},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4875999987125397},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.4855000078678131},{"id":"https://openalex.org/C63584917","wikidata":"https://www.wikidata.org/wiki/Q333286","display_name":"Bounding overwatch","level":2,"score":0.4794999957084656},{"id":"https://openalex.org/C147037132","wikidata":"https://www.wikidata.org/wiki/Q6865426","display_name":"Minimum bounding box","level":3,"score":0.46650001406669617},{"id":"https://openalex.org/C45235069","wikidata":"https://www.wikidata.org/wiki/Q278425","display_name":"Table (database)","level":2,"score":0.4535999894142151},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.4480000138282776},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4472000002861023},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.429500013589859},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.4041000008583069},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.3935000002384186},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.3903000056743622},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3788999915122986},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3456000089645386},{"id":"https://openalex.org/C199579030","wikidata":"https://www.wikidata.org/wiki/Q2851778","display_name":"Automatic image annotation","level":4,"score":0.3411000072956085},{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.33899998664855957},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3285999894142151},{"id":"https://openalex.org/C2779962950","wikidata":"https://www.wikidata.org/wiki/Q5659376","display_name":"Harmonization","level":2,"score":0.32280001044273376},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.29319998621940613},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.29179999232292175},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.2912999987602234},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2883000075817108},{"id":"https://openalex.org/C83665646","wikidata":"https://www.wikidata.org/wiki/Q42139305","display_name":"Feature vector","level":2,"score":0.27730000019073486},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.26969999074935913},{"id":"https://openalex.org/C2776035091","wikidata":"https://www.wikidata.org/wiki/Q7928819","display_name":"Viewpoints","level":2,"score":0.26019999384880066},{"id":"https://openalex.org/C184898388","wikidata":"https://www.wikidata.org/wiki/Q1435712","display_name":"Pairwise comparison","level":2,"score":0.25859999656677246},{"id":"https://openalex.org/C1667742","wikidata":"https://www.wikidata.org/wiki/Q10927554","display_name":"Image retrieval","level":3,"score":0.25839999318122864},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.2565999925136566}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.11042","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11042","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.11042","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11042","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Fine-tuning":[0],"object":[1],"detection":[2,53,124],"(OD)":[3],"models":[4],"on":[5,50,76],"combined":[6],"datasets":[7,12],"assumes":[8],"annotation":[9,60,162],"compatibility,":[10],"yet":[11],"often":[13],"encode":[14],"conflicting":[15],"spatial":[16,122],"definitions":[17],"for":[18],"semantically":[19],"equivalent":[20],"categories.":[21],"We":[22,48],"propose":[23],"an":[24],"agentic":[25],"label":[26],"harmonization":[27,112],"workflow":[28],"that":[29,150,161,170],"uses":[30],"a":[31,55,72],"vision-language":[32],"model":[33],"to":[34,95,98,129,134,144],"reconcile":[35],"both":[36],"category":[37,105],"semantics":[38],"and":[39,103,121,136,156,169],"bounding":[40,138],"box":[41,139],"granularity":[42],"across":[43,64,116],"heterogeneous":[44],"sources":[45],"before":[46,173],"training.":[47],"evaluate":[49],"document":[51,84],"layout":[52],"as":[54],"challenging":[56],"case":[57],"study,":[58],"where":[59],"standards":[61],"vary":[62],"widely":[63],"corpora.":[65],"Without":[66],"harmonization,":[67],"na\u00efve":[68],"mixed-dataset":[69],"fine-tuning":[70],"degrades":[71],"pretrained":[73],"RT-DETRv2":[74],"detector:":[75],"SCORE-Bench,":[77],"which":[78],"measures":[79],"how":[80],"accurately":[81],"the":[82,165],"full":[83],"conversion":[85],"pipeline":[86],"reproduces":[87],"ground-truth":[88],"structure,":[89,120],"table":[90,119,131],"TEDS":[91,132],"drops":[92,141],"from":[93,127,142],"0.800":[94],"0.750.":[96],"Applied":[97],"two":[99],"corpora":[100],"whose":[101],"16":[102],"10":[104],"taxonomies":[106],"share":[107],"only":[108],"8":[109],"direct":[110],"correspondences,":[111],"yields":[113],"consistent":[114],"gains":[115],"content":[117],"fidelity,":[118],"consistency:":[123],"F-score":[125],"improves":[126,133],"0.860":[128],"0.883,":[130],"0.814,":[135],"mean":[137],"overlap":[140],"0.043":[143],"0.016.":[145],"Representation":[146],"analysis":[147],"further":[148],"shows":[149],"harmonized":[151],"training":[152,174],"produces":[153],"more":[154],"compact":[155],"separable":[157],"post-decoder":[158],"embeddings,":[159],"confirming":[160],"inconsistency":[163],"distorts":[164],"learned":[166],"feature":[167],"space":[168],"resolving":[171],"it":[172],"restores":[175],"representation":[176],"structure.":[177]},"counts_by_year":[],"updated_date":"2026-04-15T06:04:33.058270","created_date":"2026-04-15T00:00:00"}
