{"id":"https://openalex.org/W7133568806","doi":"https://doi.org/10.48550/arxiv.2603.03084","title":"On the Expressive Power of Transformers for Maxout Networks and Continuous Piecewise Linear Functions","display_name":"On the Expressive Power of Transformers for Maxout Networks and Continuous Piecewise Linear Functions","publication_year":2026,"publication_date":"2026-03-03","ids":{"openalex":"https://openalex.org/W7133568806","doi":"https://doi.org/10.48550/arxiv.2603.03084"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.03084","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.03084","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.03084","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128107292","display_name":"Linyan Gu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gu, Linyan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100526897","display_name":"Lihua Yang","orcid":"https://orcid.org/0000-0002-5397-5798"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Lihua","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128061409","display_name":"Feng Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Feng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.23180000483989716,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.23180000483989716,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12808","display_name":"Ferroelectric and Negative Capacitance Devices","score":0.17139999568462372,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10320","display_name":"Neural Networks and Applications","score":0.1273999959230423,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.7555999755859375},{"id":"https://openalex.org/keywords/feed-forward","display_name":"Feed forward","score":0.5302000045776367},{"id":"https://openalex.org/keywords/piecewise-linear-function","display_name":"Piecewise linear function","score":0.5267000198364258},{"id":"https://openalex.org/keywords/affine-transformation","display_name":"Affine transformation","score":0.5210000276565552},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.460099995136261},{"id":"https://openalex.org/keywords/expressive-power","display_name":"Expressive power","score":0.43149998784065247},{"id":"https://openalex.org/keywords/linear-approximation","display_name":"Linear approximation","score":0.3986999988555908},{"id":"https://openalex.org/keywords/feedforward-neural-network","display_name":"Feedforward neural network","score":0.3912999927997589}],"concepts":[{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.7555999755859375},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6074000000953674},{"id":"https://openalex.org/C38858127","wikidata":"https://www.wikidata.org/wiki/Q5441228","display_name":"Feed forward","level":2,"score":0.5302000045776367},{"id":"https://openalex.org/C17095337","wikidata":"https://www.wikidata.org/wiki/Q2375229","display_name":"Piecewise linear function","level":2,"score":0.5267000198364258},{"id":"https://openalex.org/C92757383","wikidata":"https://www.wikidata.org/wiki/Q382497","display_name":"Affine transformation","level":2,"score":0.5210000276565552},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.460099995136261},{"id":"https://openalex.org/C195818886","wikidata":"https://www.wikidata.org/wiki/Q5421724","display_name":"Expressive power","level":2,"score":0.43149998784065247},{"id":"https://openalex.org/C160824197","wikidata":"https://www.wikidata.org/wiki/Q2071054","display_name":"Linear approximation","level":3,"score":0.3986999988555908},{"id":"https://openalex.org/C47702885","wikidata":"https://www.wikidata.org/wiki/Q5441227","display_name":"Feedforward neural network","level":3,"score":0.3912999927997589},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3684999942779541},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3402999937534332},{"id":"https://openalex.org/C164660894","wikidata":"https://www.wikidata.org/wiki/Q2037833","display_name":"Piecewise","level":2,"score":0.33489999175071716},{"id":"https://openalex.org/C47446073","wikidata":"https://www.wikidata.org/wiki/Q5165890","display_name":"Control theory (sociology)","level":3,"score":0.30889999866485596},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.2858999967575073},{"id":"https://openalex.org/C41045048","wikidata":"https://www.wikidata.org/wiki/Q202843","display_name":"Linear programming","level":2,"score":0.28450000286102295},{"id":"https://openalex.org/C163175372","wikidata":"https://www.wikidata.org/wiki/Q3339222","display_name":"Linear model","level":2,"score":0.2818000018596649},{"id":"https://openalex.org/C184720557","wikidata":"https://www.wikidata.org/wiki/Q7825049","display_name":"Topology (electrical circuits)","level":2,"score":0.27709999680519104},{"id":"https://openalex.org/C158622935","wikidata":"https://www.wikidata.org/wiki/Q660848","display_name":"Nonlinear system","level":2,"score":0.27630001306533813},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2678000032901764},{"id":"https://openalex.org/C148764684","wikidata":"https://www.wikidata.org/wiki/Q621751","display_name":"Approximation algorithm","level":2,"score":0.2526000142097473},{"id":"https://openalex.org/C104122410","wikidata":"https://www.wikidata.org/wiki/Q1416406","display_name":"Network model","level":2,"score":0.2522999942302704}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.03084","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.03084","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.03084","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.03084","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Transformer":[0,30,42,115],"networks":[1,40,43,60,113],"have":[2],"achieved":[3],"remarkable":[4],"empirical":[5],"success":[6],"across":[7],"a":[8,50,71,103],"wide":[9],"range":[10],"of":[11,29,38,58,77,92],"applications,":[12],"yet":[13],"their":[14,87],"theoretical":[15,104],"expressive":[16,27],"power":[17],"remains":[18],"insufficiently":[19],"understood.":[20],"In":[21],"this":[22,67],"paper,":[23],"we":[24,69],"study":[25],"the":[26,54,75,90],"capabilities":[28],"architectures.":[31,116],"We":[32],"first":[33],"establish":[34],"an":[35],"explicit":[36],"approximation":[37,56,76,107],"maxout":[39],"by":[41,82],"while":[44,129],"preserving":[45],"comparable":[46],"model":[47],"complexity.":[48],"As":[49],"consequence,":[51],"Transformers":[52,83],"inherit":[53],"universal":[55],"capability":[57],"ReLU":[59],"under":[61],"similar":[62],"complexity":[63],"constraints.":[64],"Building":[65],"on":[66],"connection,":[68],"develop":[70],"framework":[72],"to":[73],"analyze":[74],"continuous":[78],"piecewise":[79],"linear":[80,93],"functions":[81],"and":[84,114],"quantitatively":[85],"characterize":[86],"expressivity":[88],"via":[89],"number":[91],"regions,":[94],"which":[95],"grows":[96],"exponentially":[97],"with":[98],"depth.":[99],"Our":[100],"analysis":[101],"establishes":[102],"bridge":[105],"between":[106],"theory":[108],"for":[109],"standard":[110],"feedforward":[111,130],"neural":[112],"It":[117],"also":[118],"yields":[119],"structural":[120],"insights":[121],"into":[122],"Transformers:":[123],"self-attention":[124],"layers":[125,131],"implement":[126],"max-type":[127],"operations,":[128],"realize":[132],"token-wise":[133],"affine":[134],"transformations.":[135]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-05T00:00:00"}
