{"id":"https://openalex.org/W3137014599","doi":"https://doi.org/10.18653/v1/2021.naacl-main.278","title":"Knowledge Graph Based Synthetic Corpus Generation for Knowledge-Enhanced Language Model Pre-training","display_name":"Knowledge Graph Based Synthetic Corpus Generation for Knowledge-Enhanced Language Model Pre-training","publication_year":2021,"publication_date":"2021-01-01","ids":{"openalex":"https://openalex.org/W3137014599","doi":"https://doi.org/10.18653/v1/2021.naacl-main.278","mag":"3137014599"},"language":"en","primary_location":{"id":"doi:10.18653/v1/2021.naacl-main.278","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2021.naacl-main.278","pdf_url":"https://aclanthology.org/2021.naacl-main.278.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2021.naacl-main.278.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5032390046","display_name":"Oshin Agarwal","orcid":null},"institutions":[{"id":"https://openalex.org/I36788626","display_name":"California University of Pennsylvania","ror":"https://ror.org/01spssf70","country_code":"US","type":"education","lineage":["https://openalex.org/I36788626"]},{"id":"https://openalex.org/I79576946","display_name":"University of Pennsylvania","ror":"https://ror.org/00b30xv10","country_code":"US","type":"education","lineage":["https://openalex.org/I79576946"]},{"id":"https://openalex.org/I922845939","display_name":"Philadelphia University","ror":"https://ror.org/03zzmyz63","country_code":"US","type":"education","lineage":["https://openalex.org/I922845939"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Oshin Agarwal","raw_affiliation_strings":["University of Pennsylvania","University of Pennsylvania, Philadelphia, United States"],"affiliations":[{"raw_affiliation_string":"University of Pennsylvania","institution_ids":["https://openalex.org/I36788626"]},{"raw_affiliation_string":"University of Pennsylvania, Philadelphia, United States","institution_ids":["https://openalex.org/I922845939","https://openalex.org/I79576946"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049042872","display_name":"Heming Ge","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Heming Ge","raw_affiliation_strings":["Google Research"],"affiliations":[{"raw_affiliation_string":"Google Research","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034084374","display_name":"Siamak Shakeri","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Siamak Shakeri","raw_affiliation_strings":["Google Research","Google (United States), Mountain View, United States"],"affiliations":[{"raw_affiliation_string":"Google Research","institution_ids":["https://openalex.org/I1291425158"]},{"raw_affiliation_string":"Google (United States), Mountain View, United States","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5025234902","display_name":"Rami Al\u2010Rfou","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Rami Al-Rfou","raw_affiliation_strings":["Google Research","Google (United States), Mountain View, United States"],"affiliations":[{"raw_affiliation_string":"Google Research","institution_ids":["https://openalex.org/I1291425158"]},{"raw_affiliation_string":"Google (United States), Mountain View, United States","institution_ids":["https://openalex.org/I1291425158"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5032390046"],"corresponding_institution_ids":["https://openalex.org/I36788626","https://openalex.org/I79576946","https://openalex.org/I922845939"],"apc_list":null,"apc_paid":null,"fwci":0.5644143,"has_fulltext":true,"cited_by_count":4,"citation_normalized_percentile":{"value":0.71405885,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"3554","last_page":"3565"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13629","display_name":"Text Readability and Simplification","score":0.9848999977111816,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8228147029876709},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.7093385457992554},{"id":"https://openalex.org/keywords/knowledge-graph","display_name":"Knowledge graph","score":0.7038429379463196},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6225899457931519},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5972590446472168},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.5849573612213135},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.5309870839118958},{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.5169593691825867},{"id":"https://openalex.org/keywords/open-domain","display_name":"Open domain","score":0.5003077983856201},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.48902878165245056},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.4835938811302185},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.47181761264801025},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.46771612763404846},{"id":"https://openalex.org/keywords/domain-knowledge","display_name":"Domain knowledge","score":0.4417325258255005},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.40100041031837463}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8228147029876709},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.7093385457992554},{"id":"https://openalex.org/C2987255567","wikidata":"https://www.wikidata.org/wiki/Q33002955","display_name":"Knowledge graph","level":2,"score":0.7038429379463196},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6225899457931519},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5972590446472168},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.5849573612213135},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.5309870839118958},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.5169593691825867},{"id":"https://openalex.org/C2993776861","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Open domain","level":3,"score":0.5003077983856201},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.48902878165245056},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.4835938811302185},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.47181761264801025},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.46771612763404846},{"id":"https://openalex.org/C207685749","wikidata":"https://www.wikidata.org/wiki/Q2088941","display_name":"Domain knowledge","level":2,"score":0.4417325258255005},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.40100041031837463},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.0}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.18653/v1/2021.naacl-main.278","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2021.naacl-main.278","pdf_url":"https://aclanthology.org/2021.naacl-main.278.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2010.12688","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2010.12688","pdf_url":"https://arxiv.org/pdf/2010.12688","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"mag:3137014599","is_oa":true,"landing_page_url":"https://arxiv.org/pdf/2010.12688.pdf","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"arXiv (Cornell University)","raw_type":null},{"id":"doi:10.48550/arxiv.2010.12688","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2010.12688","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.18653/v1/2021.naacl-main.278","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2021.naacl-main.278","pdf_url":"https://aclanthology.org/2021.naacl-main.278.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","raw_type":"proceedings-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.8500000238418579}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W3137014599.pdf","grobid_xml":"https://content.openalex.org/works/W3137014599.grobid-xml"},"referenced_works_count":48,"referenced_works":["https://openalex.org/W1852412531","https://openalex.org/W2080133951","https://openalex.org/W2101105183","https://openalex.org/W2127978399","https://openalex.org/W2141608913","https://openalex.org/W2155843872","https://openalex.org/W2164079290","https://openalex.org/W2250770256","https://openalex.org/W2251913848","https://openalex.org/W2739046565","https://openalex.org/W2785611959","https://openalex.org/W2786660442","https://openalex.org/W2889009749","https://openalex.org/W2894882905","https://openalex.org/W2896391192","https://openalex.org/W2902293828","https://openalex.org/W2902887378","https://openalex.org/W2911109671","https://openalex.org/W2912924812","https://openalex.org/W2931198394","https://openalex.org/W2936695845","https://openalex.org/W2950018712","https://openalex.org/W2951048068","https://openalex.org/W2954275542","https://openalex.org/W2962739339","https://openalex.org/W2963091658","https://openalex.org/W2963341956","https://openalex.org/W2963748441","https://openalex.org/W2963906317","https://openalex.org/W2970476646","https://openalex.org/W2970686438","https://openalex.org/W2970986510","https://openalex.org/W2971155257","https://openalex.org/W2971307358","https://openalex.org/W2983787653","https://openalex.org/W3007672467","https://openalex.org/W3015883388","https://openalex.org/W3016309009","https://openalex.org/W3022814719","https://openalex.org/W3026997957","https://openalex.org/W3027879771","https://openalex.org/W3039127676","https://openalex.org/W3039578880","https://openalex.org/W3082274269","https://openalex.org/W3090656107","https://openalex.org/W3102659883","https://openalex.org/W3106255016","https://openalex.org/W3131933120"],"related_works":["https://openalex.org/W3171434230","https://openalex.org/W3015440086","https://openalex.org/W3195474261","https://openalex.org/W438338216","https://openalex.org/W2866375256","https://openalex.org/W2803549996","https://openalex.org/W2495832907","https://openalex.org/W2033051588","https://openalex.org/W948427663","https://openalex.org/W3176652733","https://openalex.org/W3091383835","https://openalex.org/W3210190743","https://openalex.org/W2156954687","https://openalex.org/W2293607556","https://openalex.org/W2612547325","https://openalex.org/W2134804405","https://openalex.org/W2896214266","https://openalex.org/W599211839","https://openalex.org/W3005940716","https://openalex.org/W1572517894"],"abstract_inverted_index":{"Prior":[0],"work":[1],"on":[2,17,134],"Data-To-Text":[3],"Generation,":[4],"the":[5,27,34,69,84,101,112,122,135,144],"task":[6],"of":[7,104,139],"converting":[8],"knowledge":[9,136,146],"graph":[10],"(KG)":[11],"triples":[12],"into":[13,86,95],"natural":[14,63,87],"text,":[15,88],"focused":[16],"domain-specific":[18],"benchmark":[19],"datasets.":[20],"In":[21,66],"this":[22,118],"paper,":[23],"however,":[24],"we":[25],"verbalize":[26],"entire":[28],"English":[29],"Wikidata":[30,54],"KG,":[31],"and":[32,62,108,130,143],"discuss":[33],"unique":[35],"challenges":[36],"associated":[37],"with":[38],"a":[39,49,126],"broad,":[40],"open-domain,":[41],"large-scale":[42],"verbalization.":[43],"We":[44,116],"further":[45,102],"show":[46],"that":[47,72],"verbalizing":[48],"comprehensive,":[50],"encyclopedic":[51],"KG":[52,85],"like":[53],"can":[55],"be":[56,92],"used":[57],"to":[58,68,76,91],"integrate":[59,77],"structured":[60],"KGs":[61],"language":[64,97,114,128],"corpora.":[65],"contrast":[67],"many":[70],"architectures":[71],"have":[73],"been":[74],"developed":[75],"these":[78],"two":[79],"sources,":[80],"our":[81],"approach":[82,119],"converts":[83],"allowing":[89],"it":[90],"seamlessly":[93],"integrated":[94],"existing":[96],"models.":[98],"It":[99],"carries":[100],"advantages":[103],"improved":[105],"factual":[106],"accuracy":[107],"reduced":[109],"toxicity":[110],"in":[111,125],"resulting":[113],"model.":[115],"evaluate":[117],"by":[120],"augmenting":[121],"retrieval":[123,127],"corpus":[124],"model":[129],"showing":[131],"significant":[132],"improvements":[133],"intensive":[137],"tasks":[138],"open":[140],"domain":[141],"QA":[142],"LAMA":[145],"probe.":[147]},"counts_by_year":[{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":2}],"updated_date":"2026-02-09T09:26:11.010843","created_date":"2025-10-10T00:00:00"}
