{"id":"https://openalex.org/W7137887570","doi":"https://doi.org/10.48550/arxiv.2603.13552","title":"Ghosts of Softmax: Complex Singularities That Limit Safe Step Sizes in Cross-Entropy","display_name":"Ghosts of Softmax: Complex Singularities That Limit Safe Step Sizes in Cross-Entropy","publication_year":2026,"publication_date":"2026-03-13","ids":{"openalex":"https://openalex.org/W7137887570","doi":"https://doi.org/10.48550/arxiv.2603.13552"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.13552","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.13552","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.13552","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5048170299","display_name":"Piyush Sao","orcid":"https://orcid.org/0000-0002-9432-5855"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Sao, Piyush","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5048170299"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.4880000054836273,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.4880000054836273,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10581","display_name":"Neural dynamics and brain function","score":0.05299999937415123,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T12611","display_name":"Neural Networks and Reservoir Computing","score":0.03849999979138374,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/softmax-function","display_name":"Softmax function","score":0.6917999982833862},{"id":"https://openalex.org/keywords/hessian-matrix","display_name":"Hessian matrix","score":0.5884000062942505},{"id":"https://openalex.org/keywords/logarithm","display_name":"Logarithm","score":0.5205000042915344},{"id":"https://openalex.org/keywords/skew","display_name":"Skew","score":0.5091000199317932},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.4650000035762787},{"id":"https://openalex.org/keywords/convergence","display_name":"Convergence (economics)","score":0.43389999866485596},{"id":"https://openalex.org/keywords/boundary","display_name":"Boundary (topology)","score":0.412200003862381},{"id":"https://openalex.org/keywords/upper-and-lower-bounds","display_name":"Upper and lower bounds","score":0.39739999175071716},{"id":"https://openalex.org/keywords/limit","display_name":"Limit (mathematics)","score":0.38119998574256897},{"id":"https://openalex.org/keywords/gradient-descent","display_name":"Gradient descent","score":0.38100001215934753}],"concepts":[{"id":"https://openalex.org/C188441871","wikidata":"https://www.wikidata.org/wiki/Q7554146","display_name":"Softmax function","level":3,"score":0.6917999982833862},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.6334999799728394},{"id":"https://openalex.org/C203616005","wikidata":"https://www.wikidata.org/wiki/Q620495","display_name":"Hessian matrix","level":2,"score":0.5884000062942505},{"id":"https://openalex.org/C39927690","wikidata":"https://www.wikidata.org/wiki/Q11197","display_name":"Logarithm","level":2,"score":0.5205000042915344},{"id":"https://openalex.org/C43711488","wikidata":"https://www.wikidata.org/wiki/Q7534783","display_name":"Skew","level":2,"score":0.5091000199317932},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.4650000035762787},{"id":"https://openalex.org/C28826006","wikidata":"https://www.wikidata.org/wiki/Q33521","display_name":"Applied mathematics","level":1,"score":0.4449000060558319},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.43389999866485596},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.41620001196861267},{"id":"https://openalex.org/C62354387","wikidata":"https://www.wikidata.org/wiki/Q875399","display_name":"Boundary (topology)","level":2,"score":0.412200003862381},{"id":"https://openalex.org/C77553402","wikidata":"https://www.wikidata.org/wiki/Q13222579","display_name":"Upper and lower bounds","level":2,"score":0.39739999175071716},{"id":"https://openalex.org/C151201525","wikidata":"https://www.wikidata.org/wiki/Q177239","display_name":"Limit (mathematics)","level":2,"score":0.38119998574256897},{"id":"https://openalex.org/C153258448","wikidata":"https://www.wikidata.org/wiki/Q1199743","display_name":"Gradient descent","level":3,"score":0.38100001215934753},{"id":"https://openalex.org/C22324862","wikidata":"https://www.wikidata.org/wiki/Q652707","display_name":"Lipschitz continuity","level":2,"score":0.3727000057697296},{"id":"https://openalex.org/C12843","wikidata":"https://www.wikidata.org/wiki/Q201721","display_name":"Gravitational singularity","level":2,"score":0.37209999561309814},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.3716999888420105},{"id":"https://openalex.org/C129844170","wikidata":"https://www.wikidata.org/wiki/Q41299","display_name":"Quadratic equation","level":2,"score":0.35370001196861267},{"id":"https://openalex.org/C81388566","wikidata":"https://www.wikidata.org/wiki/Q526668","display_name":"Sigmoid function","level":3,"score":0.34389999508857727},{"id":"https://openalex.org/C2776036281","wikidata":"https://www.wikidata.org/wiki/Q48769818","display_name":"Constraint (computer-aided design)","level":2,"score":0.33739998936653137},{"id":"https://openalex.org/C195065555","wikidata":"https://www.wikidata.org/wiki/Q214881","display_name":"Curvature","level":2,"score":0.33660000562667847},{"id":"https://openalex.org/C178635117","wikidata":"https://www.wikidata.org/wiki/Q747499","display_name":"RADIUS","level":2,"score":0.33090001344680786},{"id":"https://openalex.org/C72134830","wikidata":"https://www.wikidata.org/wiki/Q5166524","display_name":"Convexity","level":2,"score":0.3163999915122986},{"id":"https://openalex.org/C155568369","wikidata":"https://www.wikidata.org/wiki/Q1428097","display_name":"Radius of convergence","level":3,"score":0.31540000438690186},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.314300000667572},{"id":"https://openalex.org/C132459708","wikidata":"https://www.wikidata.org/wiki/Q744069","display_name":"Extrapolation","level":2,"score":0.3070000112056732},{"id":"https://openalex.org/C205203396","wikidata":"https://www.wikidata.org/wiki/Q612143","display_name":"Bilinear interpolation","level":2,"score":0.3046000003814697},{"id":"https://openalex.org/C48372109","wikidata":"https://www.wikidata.org/wiki/Q3913","display_name":"Binary number","level":2,"score":0.3028999865055084},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.29249998927116394},{"id":"https://openalex.org/C75438885","wikidata":"https://www.wikidata.org/wiki/Q3403615","display_name":"Large deviations theory","level":2,"score":0.2912999987602234},{"id":"https://openalex.org/C207390915","wikidata":"https://www.wikidata.org/wiki/Q1230525","display_name":"Divergence (linguistics)","level":2,"score":0.2879999876022339},{"id":"https://openalex.org/C2780695682","wikidata":"https://www.wikidata.org/wiki/Q4005959","display_name":"Jump","level":2,"score":0.2865000069141388},{"id":"https://openalex.org/C2780323453","wikidata":"https://www.wikidata.org/wiki/Q7113957","display_name":"Overshoot (microwave communication)","level":2,"score":0.28600001335144043},{"id":"https://openalex.org/C157709441","wikidata":"https://www.wikidata.org/wiki/Q1411887","display_name":"Uniform convergence","level":3,"score":0.2847999930381775},{"id":"https://openalex.org/C158946198","wikidata":"https://www.wikidata.org/wiki/Q131187","display_name":"Taylor series","level":2,"score":0.273499995470047},{"id":"https://openalex.org/C57869625","wikidata":"https://www.wikidata.org/wiki/Q1783502","display_name":"Rate of convergence","level":3,"score":0.26809999346733093},{"id":"https://openalex.org/C11210021","wikidata":"https://www.wikidata.org/wiki/Q1520713","display_name":"Linearization","level":3,"score":0.2639000117778778},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.2581999897956848},{"id":"https://openalex.org/C42812","wikidata":"https://www.wikidata.org/wiki/Q1082910","display_name":"Partition (number theory)","level":2,"score":0.25110000371932983},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.25029999017715454}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.13552","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.13552","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.13552","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.13552","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Optimization":[0],"analyses":[1],"for":[2,186],"cross-entropy":[3,240],"training":[4],"rely":[5],"on":[6,239],"local":[7],"Taylor":[8,30,245],"models":[9],"of":[10,33,69,127],"the":[11,21,29,34,38,52,58,77,96,101,104,111,116,125,159,163,197,203],"loss":[12,36,78],"to":[13,158,210,221],"predict":[14],"whether":[15],"a":[16,144,152,236],"proposed":[17,97,160],"step":[18,145,167,181],"will":[19],"decrease":[20],"objective.":[22],"These":[23],"surrogates":[24],"are":[25,149],"reliable":[26],"only":[27],"inside":[28],"convergence":[31,246],"radius":[32,42,106],"true":[35],"along":[37,95],"update":[39,98],"direction.":[40,99],"That":[41],"is":[43,107,124],"set":[44],"not":[45],"by":[46,51,200],"real-line":[47],"curvature":[48],"alone":[49],"but":[50],"nearest":[53],"complex":[54,65],"singularity.":[55],"For":[56],"cross-entropy,":[57],"softmax":[59],"partition":[60],"function":[61],"$F=\\sum_j":[62],"\\exp(z_j)$":[63],"has":[64],"zeros":[66],"--":[67,71],"``ghosts":[68],"softmax''":[70],"that":[72,148,214,242],"induce":[73],"logarithmic":[74],"singularities":[75],"in":[76,224],"and":[79,140,155,179],"cap":[80],"this":[81,85],"radius.":[82,164],"To":[83],"make":[84],"geometry":[86],"usable,":[87],"we":[88,114],"derive":[89],"closed-form":[90],"expressions":[91],"under":[92],"logit":[93,129],"linearization":[94],"In":[100,110],"binary":[102],"case,":[103,113],"exact":[105],"$\u03c1^*=\\sqrt{\u03b4^2+":[108],"\u03c0^2}/\u0394_a$.":[109],"multiclass":[112],"obtain":[115],"lower":[117],"bound":[118,135],"$\u03c1_a=\u03c0/\u0394_a$,":[119],"where":[120,227],"$\u0394_a=\\max_k":[121],"a_k-\\min_k":[122],"a_k$":[123],"spread":[126,205],"directional":[128],"derivatives":[130],"$a_k=\\nabla":[131],"z_k\\cdot":[132],"v$.":[133],"This":[134],"costs":[136],"one":[137],"Jacobian-vector":[138],"product":[139],"reveals":[141],"what":[142],"makes":[143],"fragile:":[146],"samples":[147],"both":[150],"near":[151],"decision":[153],"flip":[154],"highly":[156],"sensitive":[157],"direction":[161],"tighten":[162],"The":[165],"normalized":[166],"size":[168],"$r=\u03c4/\u03c1_a$":[169],"separates":[170],"safe":[171],"from":[172,206],"dangerous":[173],"updates.":[174],"Across":[175],"six":[176],"tested":[177],"architectures":[178],"multiple":[180],"directions,":[182],"no":[183],"model":[184],"fails":[185],"$r&lt;1$,":[187],"yet":[188],"collapse":[189],"appears":[190],"once":[191],"$r\\ge":[192],"1$.":[193],"Temperature":[194],"scaling":[195],"confirms":[196],"mechanism:":[198],"normalizing":[199],"$\u03c1_a$":[201],"shrinks":[202],"onset-threshold":[204],"standard":[207],"deviation":[208],"$0.992$":[209],"$0.164$.":[211],"A":[212],"controller":[213],"enforces":[215],"$\u03c4\\le\u03c1_a$":[216],"survives":[217],"learning-rate":[218],"spikes":[219],"up":[220],"$10{,}":[222],"000\\times$":[223],"our":[225],"tests,":[226],"gradient":[228],"clipping":[229],"still":[230],"collapses.":[231],"Together,":[232],"these":[233],"results":[234],"identify":[235],"geometric":[237],"constraint":[238],"optimization":[241],"operates":[243],"through":[244],"rather":[247],"than":[248],"Hessian":[249],"curvature.":[250]},"counts_by_year":[],"updated_date":"2026-03-18T06:31:55.123368","created_date":"2026-03-18T00:00:00"}
