{"id":"https://openalex.org/W7160832027","doi":"https://doi.org/10.48550/arxiv.2605.07053","title":"GSM-SEM: Benchmark and Framework for Generating Semantically Variant Augmentations","display_name":"GSM-SEM: Benchmark and Framework for Generating Semantically Variant Augmentations","publication_year":2026,"publication_date":"2026-05-08","ids":{"openalex":"https://openalex.org/W7160832027","doi":"https://doi.org/10.48550/arxiv.2605.07053"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.07053","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.07053","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.07053","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135872290","display_name":"Jyotika Singh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Singh, Jyotika","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108651780","display_name":"Fang Tu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tu, Fang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135863241","display_name":"Aziza Mirzadova","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mirsaidova, Aziza","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135907830","display_name":"Amit Agarwal","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Agarwal, Amit","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114200451","display_name":"Hitesh Laxmichand Patel","orcid":"https://orcid.org/0009-0009-7492-5173"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Patel, Hitesh Laxmichand","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059423710","display_name":"Sandip Ghoshal","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ghoshal, Sandip","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101731788","display_name":"Miguel Ballesteros","orcid":"https://orcid.org/0000-0003-3949-0361"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ballesteros, Miguel","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109466332","display_name":"Yassine Benajiba","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dua, Karan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135891941","display_name":"Weiyi Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Benajiba, Yassine","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135881484","display_name":"Graham Horwood","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Weiyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135891485","display_name":"Sujith Ravi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sheng, Tao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135887468","display_name":"Dan Roth","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Horwood, Graham","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Ravi, Sujith","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ravi, Sujith","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Roth, Dan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Roth, Dan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":14,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13523","display_name":"Mathematics, Computing, and Information Processing","score":0.3165999948978424,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13523","display_name":"Mathematics, Computing, and Information Processing","score":0.3165999948978424,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10126","display_name":"Logic, programming, and type systems","score":0.06920000165700912,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11450","display_name":"Model-Driven Software Engineering Techniques","score":0.06239999830722809,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.7361999750137329},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.6539000272750854},{"id":"https://openalex.org/keywords/memorization","display_name":"Memorization","score":0.6025000214576721},{"id":"https://openalex.org/keywords/variance","display_name":"Variance (accounting)","score":0.5645999908447266},{"id":"https://openalex.org/keywords/variation","display_name":"Variation (astronomy)","score":0.4593000113964081},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.3767000138759613}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7422000169754028},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.7361999750137329},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.6539000272750854},{"id":"https://openalex.org/C30038468","wikidata":"https://www.wikidata.org/wiki/Q4354775","display_name":"Memorization","level":2,"score":0.6025000214576721},{"id":"https://openalex.org/C196083921","wikidata":"https://www.wikidata.org/wiki/Q7915758","display_name":"Variance (accounting)","level":2,"score":0.5645999908447266},{"id":"https://openalex.org/C2778334786","wikidata":"https://www.wikidata.org/wiki/Q1586270","display_name":"Variation (astronomy)","level":2,"score":0.4593000113964081},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.4004000127315521},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3926999866962433},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3767000138759613},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.32739999890327454},{"id":"https://openalex.org/C8272713","wikidata":"https://www.wikidata.org/wiki/Q176737","display_name":"Stochastic process","level":2,"score":0.2953999936580658},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.26420000195503235},{"id":"https://openalex.org/C2778180026","wikidata":"https://www.wikidata.org/wiki/Q18378163","display_name":"Semantic heterogeneity","level":4,"score":0.26089999079704285}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.07053","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.07053","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.07053","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.07053","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Benchmarks":[0],"like":[1],"GSM8K":[2,136],"are":[3,165],"popular":[4],"measures":[5],"of":[6,19,130,178],"mathematical":[7],"reasoning,":[8],"but":[9],"leaderboard":[10],"gains":[11],"can":[12,43],"overstate":[13],"true":[14],"capability":[15],"due":[16],"to":[17,89,98,192,202],"memorization":[18,46],"fixed":[20],"test":[21],"sets.":[22],"Most":[23],"robustness":[24],"variants":[25,63,110,186],"apply":[26,133,200],"surface-level":[27],"perturbations":[28,164],"(paraphrases,":[29],"renamings,":[30],"number":[31],"swaps,":[32],"distractors)":[33],"that":[34],"largely":[35],"preserve":[36,99],"the":[37,100,128,183],"underlying":[38,84],"facts,":[39],"and":[40,55,86,103,125,137,143,148,208],"static":[41,120],"releases":[42],"themselves":[44],"become":[45],"targets":[47],"over":[48],"time.":[49],"We":[50,132,180],"introduce":[51],"GSM-SEM,":[52],"a":[53],"reusable":[54],"stochastic":[56],"framework":[57],"for":[58,123],"generating":[59],"semantically":[60],"diverse":[61],"benchmark":[62],"with":[64,159,167],"substantially":[65],"higher":[66],"semantic":[67,163],"variance":[68],"than":[69],"prior":[70],"approaches.":[71],"GSM-SEM":[72,107,134,201],"perturbs":[73],"problem":[74,105],"statements":[75],"by":[76],"modifying":[77],"entities,":[78],"attributes,":[79],"and/or":[80],"relationships,":[81],"frequently":[82],"altering":[83],"facts":[85],"requiring":[87,115],"models":[88],"recompute":[90],"solutions":[91],"under":[92],"new":[93],"conditions,":[94],"while":[95],"constraining":[96],"generation":[97],"original":[101],"calculations/answer":[102],"approximate":[104],"difficulty.":[106],"generates":[108],"fresh":[109],"on":[111,119,135],"each":[112],"run":[113],"without":[114],"re-annotation,":[116],"reducing":[117],"reliance":[118],"public":[121],"benchmarks":[122,204],"evaluation":[124],"thereby":[126],"lowering":[127],"bias":[129],"memorization.":[131],"two":[138],"existing":[139],"variation":[140],"suites":[141],"(GSM-Symbolic":[142],"GSM-Plus),":[144],"producing":[145],"GSM8K-SEM,":[146],"GSM-Symbolic-SEM,":[147],"GSM-Plus-SEM.":[149],"Evaluating":[150],"14":[151],"SOTA":[152],"LLMs,":[153],"we":[154,199],"observe":[155],"consistent":[156],"performance":[157],"drops":[158],"larger":[160],"decline":[161],"when":[162],"coupled":[166],"symbolic/plus":[168],"variations":[169],"(average":[170],"drop":[171],"rate":[172],"28%":[173],"in":[174],"maximum":[175],"strictness":[176],"configuration":[177],"GSM-SEM).":[179],"publicly":[181],"release":[182],"three":[184],"SEM":[185],"as":[187],"fully":[188],"human-validated":[189],"datasets.":[190],"Finally,":[191],"demonstrate":[193],"applicability":[194],"beyond":[195],"GSM-style":[196],"math":[197],"problems,":[198],"additional":[203],"including":[205],"BigBenchHard,":[206],"LogicBench,":[207],"NLR-BIRD.":[209]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-12T00:00:00"}
