{"id":"https://openalex.org/W4416037168","doi":"https://doi.org/10.18653/v1/2025.emnlp-industry.184","title":"PARSE: LLM Driven Schema Optimization for Reliable Entity Extraction","display_name":"PARSE: LLM Driven Schema Optimization for Reliable Entity Extraction","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4416037168","doi":"https://doi.org/10.18653/v1/2025.emnlp-industry.184"},"language":null,"primary_location":{"id":"doi:10.18653/v1/2025.emnlp-industry.184","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-industry.184","pdf_url":"https://aclanthology.org/2025.emnlp-industry.184.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.emnlp-industry.184.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5072543351","display_name":"Anubhav Shrimal","orcid":"https://orcid.org/0000-0002-6269-5771"},"institutions":[{"id":"https://openalex.org/I1311688040","display_name":"Amazon (United States)","ror":"https://ror.org/04mv4n011","country_code":"US","type":"company","lineage":["https://openalex.org/I1311688040"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Anubhav Shrimal","raw_affiliation_strings":["RBS Tech Sciences , Amazon"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"RBS Tech Sciences , Amazon","institution_ids":["https://openalex.org/I1311688040"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009899525","display_name":"Aryan Jain","orcid":null},"institutions":[{"id":"https://openalex.org/I1311688040","display_name":"Amazon (United States)","ror":"https://ror.org/04mv4n011","country_code":"US","type":"company","lineage":["https://openalex.org/I1311688040"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Aryan Jain","raw_affiliation_strings":["RBS Tech Sciences , Amazon"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"RBS Tech Sciences , Amazon","institution_ids":["https://openalex.org/I1311688040"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Soumyajit Chowdhury","orcid":null},"institutions":[{"id":"https://openalex.org/I1311688040","display_name":"Amazon (United States)","ror":"https://ror.org/04mv4n011","country_code":"US","type":"company","lineage":["https://openalex.org/I1311688040"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Soumyajit Chowdhury","raw_affiliation_strings":["RBS Tech Sciences , Amazon"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"RBS Tech Sciences , Amazon","institution_ids":["https://openalex.org/I1311688040"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5014940317","display_name":"Promod Yenigalla","orcid":null},"institutions":[{"id":"https://openalex.org/I1311688040","display_name":"Amazon (United States)","ror":"https://ror.org/04mv4n011","country_code":"US","type":"company","lineage":["https://openalex.org/I1311688040"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Promod Yenigalla","raw_affiliation_strings":["RBS Tech Sciences , Amazon"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"RBS Tech Sciences , Amazon","institution_ids":["https://openalex.org/I1311688040"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I1311688040"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.34543326,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"2749","last_page":"2763"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.2808000147342682,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.2808000147342682,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10538","display_name":"Data Mining Algorithms and Applications","score":0.09040000289678574,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.0877000018954277,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/schema","display_name":"Schema (genetic algorithms)","score":0.4544000029563904},{"id":"https://openalex.org/keywords/information-extraction","display_name":"Information extraction","score":0.304500013589859},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.2533000111579895},{"id":"https://openalex.org/keywords/data-extraction","display_name":"Data extraction","score":0.2451999932527542}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7163000106811523},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.48019999265670776},{"id":"https://openalex.org/C52146309","wikidata":"https://www.wikidata.org/wiki/Q7431116","display_name":"Schema (genetic algorithms)","level":2,"score":0.4544000029563904},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.35910001397132874},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.304500013589859},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.2669000029563904},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.2533000111579895},{"id":"https://openalex.org/C2777466982","wikidata":"https://www.wikidata.org/wiki/Q5227287","display_name":"Data extraction","level":3,"score":0.2451999932527542},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2378000020980835},{"id":"https://openalex.org/C2777327318","wikidata":"https://www.wikidata.org/wiki/Q1408390","display_name":"Schema matching","level":3,"score":0.22540000081062317}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2025.emnlp-industry.184","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-industry.184","pdf_url":"https://aclanthology.org/2025.emnlp-industry.184.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.emnlp-industry.184","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-industry.184","pdf_url":"https://aclanthology.org/2025.emnlp-industry.184.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4416037168.pdf","grobid_xml":"https://content.openalex.org/works/W4416037168.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Structured":[0,170],"information":[1],"extraction":[2,29,61,151,190,204],"from":[3],"unstructured":[4],"text":[5],"is":[6],"critical":[7],"for":[8,55,132],"emerging":[9],"Software":[10],"3.0":[11],"systems":[12],"where":[13],"LLM":[14,133],"agents":[15],"autonomously":[16,128],"interact":[17],"with":[18,36,122,152,194],"APIs":[19],"and":[20,65,93,107,116,146,155,161,175,180,212,213],"tools.Recent":[21],"approaches":[22,42],"apply":[23],"large":[24],"language":[25,86],"models":[26],"directly":[27],"to":[28,43,59,104,186],"tasks":[30],"using":[31],"existing":[32],"JSON":[33,49,78,130],"schemas,":[34],"often":[35],"constraint":[37],"decoding":[38],"or":[39,73],"reinforcement":[40],"learning":[41],"ensure":[44],"syntactic":[45],"validity,":[46],"but":[47],"treat":[48],"schemas":[50,70,79,131],"as":[51],"static":[52,154],"contracts":[53,98],"designed":[54],"human":[56],"developers,":[57],"leading":[58],"suboptimal":[60],"performance,":[62],"frequent":[63],"hallucinations,":[64],"unreliable":[66],"agent":[67],"behavior":[68],"when":[69],"contain":[71],"ambiguous":[72],"incomplete":[74],"specifications.We":[75],"recognize":[76],"that":[77,89,99,182],"themselves":[80],"are":[81],"a":[82,119],"form":[83],"of":[84],"natural":[85],"understanding":[87],"contract":[88],"encodes":[90],"rules,":[91],"relationships,":[92],"expectations":[94],"about":[95],"data":[96],"structure":[97],"LLMs":[100],"should":[101],"be":[102],"able":[103],"both":[105],"interpret":[106],"systematically":[108],"improve.Consequently,":[109],"we":[110],"develop":[111],"PARSE":[112,159],"(Parameter":[113],"Automated":[114],"Refinement":[115],"Schema":[117],"Extraction),":[118],"novel":[120],"system":[121],"two":[123],"synergistic":[124],"components:":[125],"ARCHITECT,":[126],"which":[127,148],"optimizes":[129],"consumption":[134],"while":[135,202],"maintaining":[136,214],"backward":[137],"compatibility":[138],"through":[139],"RELAY":[140],"(an":[141],"integrated":[142],"code":[143],"generation":[144],"system),":[145],"SCOPE,":[147],"implements":[149],"reflection-based":[150],"combined":[153,195],"LLM-based":[156],"guardrails.We":[157],"evaluate":[158],"qualitatively":[160],"quantitatively":[162],"on":[163,192],"three":[164],"datasets":[165],"including":[166],"Schema-Guided":[167],"Dialogue":[168],"(SGD),":[169],"Web":[171],"Data":[172],"Extraction":[173],"(SWDE),":[174],"internal":[176],"retail":[177],"conversation":[178],"data,":[179],"find":[181],"it":[183],"achieves":[184],"up":[185],"64.7%":[187],"improvement":[188],"in":[189],"accuracy":[191],"SWDE":[193],"framework":[196],"improvements":[197],"reaching":[198],"10%":[199],"across":[200],"models,":[201],"reducing":[203],"errors":[205],"by":[206],"92%":[207],"within":[208],"the":[209],"first":[210],"retry":[211],"practical":[215],"latency.":[216]},"counts_by_year":[],"updated_date":"2026-06-26T08:34:08.712188","created_date":"2025-11-08T00:00:00"}
