{"id":"https://openalex.org/W4412877044","doi":"https://doi.org/10.1145/3711896.3737432","title":"ScIRGen: Synthesize Realistic and Large-Scale RAG Dataset for Scientific Research","display_name":"ScIRGen: Synthesize Realistic and Large-Scale RAG Dataset for Scientific Research","publication_year":2025,"publication_date":"2025-08-03","ids":{"openalex":"https://openalex.org/W4412877044","doi":"https://doi.org/10.1145/3711896.3737432"},"language":"en","primary_location":{"id":"doi:10.1145/3711896.3737432","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3711896.3737432","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V.2","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5002998698","display_name":"Junyong Lin","orcid":"https://orcid.org/0009-0007-2847-4071"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Junyong Lin","raw_affiliation_strings":["The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, Guangdong, China"],"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, Guangdong, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015785636","display_name":"Lu Dai","orcid":"https://orcid.org/0000-0001-9795-8968"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]},{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Lu Dai","raw_affiliation_strings":["The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, Guangdong, China and The Hong Kong University of Science and Technology, Hong Kong SAR, Hong Kong"],"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, Guangdong, China and The Hong Kong University of Science and Technology, Hong Kong SAR, Hong Kong","institution_ids":["https://openalex.org/I200769079","https://openalex.org/I889458895"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5119175131","display_name":"Ruiqian Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ruiqian Han","raw_affiliation_strings":["The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, Guangdong, China"],"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, Guangdong, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028971108","display_name":"Yijie Sui","orcid":"https://orcid.org/0009-0002-4344-7745"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210141476","display_name":"Institute of Tibetan Plateau Research","ror":"https://ror.org/03zn6c508","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210141476"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yijie Sui","raw_affiliation_strings":["Institute of Tibetan Plateau Research, Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Tibetan Plateau Research, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210141476","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042898580","display_name":"R. Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I76214153","display_name":"Lanzhou University","ror":"https://ror.org/01mkqqe32","country_code":"CN","type":"education","lineage":["https://openalex.org/I76214153"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ruilin Wang","raw_affiliation_strings":["Lanzhou University, Lanzhou, Gansu, China"],"affiliations":[{"raw_affiliation_string":"Lanzhou University, Lanzhou, Gansu, China","institution_ids":["https://openalex.org/I76214153"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081058079","display_name":"Xingliang Sun","orcid":"https://orcid.org/0000-0002-3756-2758"},"institutions":[{"id":"https://openalex.org/I76214153","display_name":"Lanzhou University","ror":"https://ror.org/01mkqqe32","country_code":"CN","type":"education","lineage":["https://openalex.org/I76214153"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xingliang Sun","raw_affiliation_strings":["Lanzhou University, Lanzhou, Gansu, China"],"affiliations":[{"raw_affiliation_string":"Lanzhou University, Lanzhou, Gansu, China","institution_ids":["https://openalex.org/I76214153"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100624447","display_name":"Qinglin Wu","orcid":"https://orcid.org/0009-0003-0514-6287"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210141476","display_name":"Institute of Tibetan Plateau Research","ror":"https://ror.org/03zn6c508","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210141476"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qinglin Wu","raw_affiliation_strings":["Institute of Tibetan Plateau Research, Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Tibetan Plateau Research, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210141476","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033295969","display_name":"Min Feng","orcid":"https://orcid.org/0000-0001-7456-7534"},"institutions":[{"id":"https://openalex.org/I4210141476","display_name":"Institute of Tibetan Plateau Research","ror":"https://ror.org/03zn6c508","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210141476"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Min Feng","raw_affiliation_strings":["Institute of Tibetan Plateau Research, Chinese Academy of Sciences, Beijing, China and College of Resources and Environment, University of Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Tibetan Plateau Research, Chinese Academy of Sciences, Beijing, China and College of Resources and Environment, University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210141476"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100458897","display_name":"Hao Liu","orcid":"https://orcid.org/0000-0003-4271-1567"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]},{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Hao Liu","raw_affiliation_strings":["The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China and The Hong Kong University of Science and Technology, Hong Kong SAR, Hong Kong"],"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China and The Hong Kong University of Science and Technology, Hong Kong SAR, Hong Kong","institution_ids":["https://openalex.org/I200769079","https://openalex.org/I889458895"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101862104","display_name":"Hui Xiong","orcid":"https://orcid.org/0000-0001-6016-6465"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]},{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Hui Xiong","raw_affiliation_strings":["The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China and The Hong Kong University of Science and Technology, Hong Kong SAR, Hong Kong"],"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China and The Hong Kong University of Science and Technology, Hong Kong SAR, Hong Kong","institution_ids":["https://openalex.org/I200769079","https://openalex.org/I889458895"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5002998698"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.09527969,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"5619","last_page":"5630"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13702","display_name":"Machine Learning in Healthcare","score":0.9908999800682068,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13702","display_name":"Machine Learning in Healthcare","score":0.9908999800682068,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9761999845504761,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.9635000228881836,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6566662192344666},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.6472880244255066},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.38360750675201416},{"id":"https://openalex.org/keywords/geography","display_name":"Geography","score":0.0803895890712738},{"id":"https://openalex.org/keywords/cartography","display_name":"Cartography","score":0.07886180281639099}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6566662192344666},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.6472880244255066},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.38360750675201416},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0803895890712738},{"id":"https://openalex.org/C58640448","wikidata":"https://www.wikidata.org/wiki/Q42515","display_name":"Cartography","level":1,"score":0.07886180281639099}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3711896.3737432","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3711896.3737432","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V.2","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":21,"referenced_works":["https://openalex.org/W2146936057","https://openalex.org/W2889787757","https://openalex.org/W2912924812","https://openalex.org/W2945260553","https://openalex.org/W2949849869","https://openalex.org/W2970771982","https://openalex.org/W3027879771","https://openalex.org/W3035324702","https://openalex.org/W3101007570","https://openalex.org/W3156836409","https://openalex.org/W3190126809","https://openalex.org/W3208454977","https://openalex.org/W4377864264","https://openalex.org/W4385571873","https://openalex.org/W4385572634","https://openalex.org/W4388697414","https://openalex.org/W4392384758","https://openalex.org/W4402671689","https://openalex.org/W4404782381","https://openalex.org/W4404782900","https://openalex.org/W6853620503"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"Scientific":[0],"researchers":[1],"need":[2],"intensive":[3],"information":[4,16,78,107,216],"about":[5],"datasets":[6,19,41,99],"to":[7,87,114,130,142,212],"effectively":[8],"evaluate":[9],"and":[10,13,38,84,100,188],"develop":[11],"theories":[12],"methodologies.":[14],"The":[15],"needs":[17,79,217],"regarding":[18],"are":[20],"implicitly":[21],"embedded":[22],"in":[23,31,168],"particular":[24],"research":[25,55],"tasks,":[26],"rather":[27],"than":[28],"explicitly":[29],"expressed":[30],"search":[32],"queries.":[33],"However,":[34],"existing":[35],"scientific":[36,69,91,220],"retrieval":[37,72,189],"question-answering":[39,187],"(QA)":[40],"typically":[42],"address":[43],"straightforward":[44],"questions,":[45],"which":[46,154],"do":[47],"not":[48],"align":[49],"with":[50,96,158],"the":[51,77,116,132,149,169,172,182,206,214,219],"distribution":[52],"of":[53,80,134,152,161,171,208,218],"real-world":[54],"inquiries.":[56],"To":[57],"bridge":[58],"this":[59],"gap,":[60],"we":[61,103],"developed":[62],"ScIRGen,":[63],"a":[64,89,105,122,140],"dataset":[65,95,117,184],"generation":[66,93,124],"framework":[67,125],"for":[68,185],"QA":[70,174],"&":[71],"that":[73,110,193],"more":[74,209],"accurately":[75],"reflects":[76],"professional":[81],"science":[82],"researchers,":[83],"uses":[85],"it":[86],"create":[88],"large-scale":[90],"retrieval-augmented":[92],"(RAG)":[94],"realistic":[97],"queries,":[98],"papers.":[101],"Technically,":[102],"designed":[104],"dataset-oriented":[106],"extraction":[108],"method":[109,141],"leverages":[111],"academic":[112],"papers":[113],"augment":[115],"representation.":[118],"We":[119,137,177],"then":[120],"proposed":[121],"question":[123],"by":[126],"employing":[127],"cognitive":[128],"taxonomy":[129],"ensure":[131],"quality":[133],"synthesized":[135],"questions.":[136,202],"also":[138],"design":[139],"automatically":[143],"filter":[144],"synthetic":[145],"answers":[146],"based":[147],"on":[148,181],"perplexity":[150],"shift":[151],"LLMs,":[153],"is":[155],"highly":[156],"aligned":[157],"human":[159],"judgment":[160],"answers'":[162],"validity.":[163],"Collectively,":[164],"these":[165],"methodologies":[166],"culminated":[167],"creation":[170],"61k":[173],"dataset,":[175],"ScIRGen-Geo.":[176],"benchmarked":[178],"representative":[179],"methods":[180,195],"ScIRGen-Geo":[183],"their":[186],"capabilities,":[190],"finding":[191],"out":[192],"current":[194],"still":[196],"suffer":[197],"from":[198,200],"reasoning":[199],"complex":[201],"This":[203],"work":[204],"advances":[205],"development":[207],"sophisticated":[210],"tools":[211],"support":[213],"intricate":[215],"community.":[221]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
