{"id":"https://openalex.org/W4416034227","doi":"https://doi.org/10.18653/v1/2025.findings-emnlp.1238","title":"Enhancing Domain-Specific Encoder Models with LLM-Generated Data: How to Leverage Ontologies, and How to Do Without Them","display_name":"Enhancing Domain-Specific Encoder Models with LLM-Generated Data: How to Leverage Ontologies, and How to Do Without Them","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4416034227","doi":"https://doi.org/10.18653/v1/2025.findings-emnlp.1238"},"language":null,"primary_location":{"id":"doi:10.18653/v1/2025.findings-emnlp.1238","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-emnlp.1238","pdf_url":"https://aclanthology.org/2025.findings-emnlp.1238.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: EMNLP 2025","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.findings-emnlp.1238.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5092596222","display_name":"Marc Brinner","orcid":"https://orcid.org/0000-0002-5121-2922"},"institutions":[{"id":"https://openalex.org/I2802799214","display_name":"Association for Computational Linguistics","ror":"https://ror.org/019sw1443","country_code":"US","type":"other","lineage":["https://openalex.org/I2802799214"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Marc Felix Brinner","raw_affiliation_strings":["Computational Linguistics Department of Linguistics Bielefeld University , Germany"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Computational Linguistics Department of Linguistics Bielefeld University , Germany","institution_ids":["https://openalex.org/I2802799214"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083417294","display_name":"Tarek Al Mustafa","orcid":"https://orcid.org/0000-0001-7793-4483"},"institutions":[{"id":"https://openalex.org/I76198965","display_name":"Friedrich Schiller University Jena","ror":"https://ror.org/05qpz1x62","country_code":"DE","type":"education","lineage":["https://openalex.org/I76198965"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Tarek Al Mustafa","raw_affiliation_strings":["Heinz Nixdorf Chair for Distibuted Information Systems Institute of Computer Science Friedrich Schiller University Jena , Germany"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Heinz Nixdorf Chair for Distibuted Information Systems Institute of Computer Science Friedrich Schiller University Jena , Germany","institution_ids":["https://openalex.org/I76198965"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5078051602","display_name":"Sina Zarrie\u00df","orcid":"https://orcid.org/0000-0002-1384-1218"},"institutions":[{"id":"https://openalex.org/I2802799214","display_name":"Association for Computational Linguistics","ror":"https://ror.org/019sw1443","country_code":"US","type":"other","lineage":["https://openalex.org/I2802799214"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sina Zarrie\u00df","raw_affiliation_strings":["Computational Linguistics Department of Linguistics Bielefeld University , Germany"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Computational Linguistics Department of Linguistics Bielefeld University , Germany","institution_ids":["https://openalex.org/I2802799214"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.7312,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":{"value":0.89268112,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"22740","last_page":"22754"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.1940000057220459,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.1940000057220459,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.14180000126361847,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.09780000150203705,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.7739999890327454},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.7660999894142151},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.5896999835968018},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.5681999921798706},{"id":"https://openalex.org/keywords/labeled-data","display_name":"Labeled data","score":0.46709999442100525},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.4302999973297119},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.4244999885559082},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.42399999499320984}],"concepts":[{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.7739999890327454},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.7660999894142151},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7615000009536743},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.5896999835968018},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.5681999921798706},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5134999752044678},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5047000050544739},{"id":"https://openalex.org/C2776145971","wikidata":"https://www.wikidata.org/wiki/Q30673951","display_name":"Labeled data","level":2,"score":0.46709999442100525},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.4302999973297119},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.4244999885559082},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.42399999499320984},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.4088999927043915},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.38420000672340393},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.36890000104904175},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.36660000681877136},{"id":"https://openalex.org/C58489278","wikidata":"https://www.wikidata.org/wiki/Q1172284","display_name":"Data set","level":2,"score":0.3458000123500824},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.3327000141143799},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3264999985694885},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.3073999881744385},{"id":"https://openalex.org/C105002631","wikidata":"https://www.wikidata.org/wiki/Q4833645","display_name":"Subject-matter expert","level":3,"score":0.27559998631477356},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.26570001244544983},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.25440001487731934}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.18653/v1/2025.findings-emnlp.1238","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-emnlp.1238","pdf_url":"https://aclanthology.org/2025.findings-emnlp.1238.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: EMNLP 2025","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2503.22006","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2503.22006","pdf_url":"https://arxiv.org/pdf/2503.22006","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.findings-emnlp.1238","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-emnlp.1238","pdf_url":"https://aclanthology.org/2025.findings-emnlp.1238.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: EMNLP 2025","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4416034227.pdf","grobid_xml":"https://content.openalex.org/works/W4416034227.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0],"investigate":[1,84],"the":[2,22,46,58,85,89],"use":[3],"of":[4,10,25,60,87,109,134,147],"LLM-generated":[5,42],"data":[6,43],"for":[7,54,69,143,155],"continual":[8],"pretraining":[9,45,168],"transformer":[11],"encoder":[12,47,149],"models":[13,150],"in":[14,73,138,157],"specialized":[15],"domains":[16,93],"with":[17,41,101],"limited":[18],"training":[19],"data,":[20],"using":[21,129],"scientific":[23,110,135],"domain":[24],"invasion":[26,74],"biology":[27],"as":[28,49],"a":[29,65,106,131,139],"case":[30],"study.To":[31],"this":[32,61,123],"end,":[33],"we":[34,63,83],"leverage":[35],"domain-specific":[36,145],"ontologies":[37,96],"by":[38,97],"enriching":[39],"them":[40],"and":[44,112,160],"model":[48,53,71],"an":[50],"ontologyinformed":[51],"embedding":[52],"concept":[55],"definitions.To":[56],"evaluate":[57],"effectiveness":[59],"method,":[62],"compile":[64],"benchmark":[66],"specifically":[67],"designed":[68],"assessing":[70],"performance":[72,128,162],"biology.After":[75],"demonstrating":[76],"substantial":[77],"improvements":[78],"over":[79],"standard":[80],"MLM":[81],"pretraining,":[82],"feasibility":[86],"applying":[88],"proposed":[90],"approach":[91,125],"to":[92,164],"without":[94],"comprehensive":[95],"substituting":[98],"ontological":[99],"concepts":[100,102,116],"automatically":[103],"extracted":[104],"from":[105],"small":[107,132,148],"corpus":[108],"abstracts":[111],"establishing":[113],"relationships":[114],"between":[115],"through":[117],"distributional":[118],"statistics.Our":[119],"results":[120],"demonstrate":[121],"that":[122,151],"automated":[124,141],"achieves":[126,161],"comparable":[127,163],"only":[130],"set":[133],"abstracts,":[136],"resulting":[137],"fully":[140],"pipeline":[142],"enhancing":[144],"understanding":[146],"is":[152],"especially":[153],"suited":[154],"application":[156],"low-resource":[158],"settings":[159],"masked":[165],"language":[166],"modeling":[167],"on":[169],"much":[170],"larger":[171],"datasets.":[172],"Hypothesis":[173,175],"Clf":[174,177],"SpanImpact":[176],"Impact":[178]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-06-14T07:44:22.658603","created_date":"2025-11-08T00:00:00"}
