{"id":"https://openalex.org/W7128374676","doi":"https://doi.org/10.48550/arxiv.2602.06471","title":"Revisiting the Shape Convention of Transformer Language Models","display_name":"Revisiting the Shape Convention of Transformer Language Models","publication_year":2026,"publication_date":"2026-02-06","ids":{"openalex":"https://openalex.org/W7128374676","doi":"https://doi.org/10.48550/arxiv.2602.06471"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.06471","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5067740015","display_name":"Feng-Ting Liao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liao, Feng-Ting","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Chen, Meng-Hsi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Meng-Hsi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120308860","display_name":"Guan-Ting Yi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yi, Guan-Ting","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5035060493","display_name":"Da-shan Shiu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shiu, Da-shan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.2700999975204468,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.2700999975204468,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.1867000013589859,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.09939999878406525,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/hourglass","display_name":"Hourglass","score":0.8047000169754028},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.6446999907493591},{"id":"https://openalex.org/keywords/residual","display_name":"Residual","score":0.6287999749183655},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.6269000172615051},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.3174999952316284},{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.30959999561309814}],"concepts":[{"id":"https://openalex.org/C127532173","wikidata":"https://www.wikidata.org/wiki/Q179904","display_name":"Hourglass","level":2,"score":0.8047000169754028},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6504999995231628},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.6446999907493591},{"id":"https://openalex.org/C155512373","wikidata":"https://www.wikidata.org/wiki/Q287450","display_name":"Residual","level":2,"score":0.6287999749183655},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.6269000172615051},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3898000121116638},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.3174999952316284},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.30959999561309814},{"id":"https://openalex.org/C2778952367","wikidata":"https://www.wikidata.org/wiki/Q3491532","display_name":"Combing","level":2,"score":0.30630001425743103},{"id":"https://openalex.org/C2780608745","wikidata":"https://www.wikidata.org/wiki/Q367293","display_name":"Convention","level":2,"score":0.3050999939441681},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.2994000017642975},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.29429998993873596},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.26109999418258667},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.25690001249313354}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.06471","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.06471","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.06471","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.06471","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Dense":[0],"Transformer":[1,78],"language":[2,220],"models":[3],"have":[4],"largely":[5],"adhered":[6],"to":[7,32,115,159,168],"one":[8],"consistent":[9,182],"architectural":[10],"shape:":[11],"each":[12],"layer":[13],"consists":[14],"of":[15,68,93,203],"an":[16],"attention":[17,179,212],"module":[18],"followed":[19],"by":[20,43,97,123,136],"a":[21,26,77,86,91,103,112,125,201],"feed-forward":[22],"network":[23],"(FFN)":[24],"with":[25,85,174],"narrow-wide-narrow":[27,70,205],"MLP,":[28],"allocating":[29],"most":[30],"parameters":[31,121,180],"the":[33,58,66,69,82,116,204,209],"MLP":[34,60,206],"at":[35,165,187],"expansion":[36],"ratios":[37],"between":[38,211],"2":[39],"and":[40,119,161,177,199,208,213,217],"4.":[41],"Motivated":[42],"recent":[44,197],"results":[45],"that":[46,80,102,120],"residual":[47,98],"wide-narrow-wide":[48],"(hourglass)":[49],"MLPs":[50],"offer":[51],"superior":[52],"function":[53],"approximation":[54],"capabilities,":[55],"we":[56,75],"revisit":[57],"long-standing":[59],"shape":[61],"convention":[62,207],"in":[63],"Transformer,":[64],"challenging":[65],"necessity":[67],"design.":[71],"To":[72],"study":[73],"this,":[74],"develop":[76],"variant":[79],"replaces":[81],"conventional":[83,117,156,185],"FFN":[84,108,128,172,176,214],"deeper":[87,104],"hourglass-shaped":[88],"FFN,":[89,118],"comprising":[90],"stack":[92],"hourglass":[94,107,127,153,171],"sub-MLPs":[95],"connected":[96],"pathways.":[99],"We":[100,144],"posit":[101],"but":[105],"lighter":[106,126],"can":[109,129],"serve":[110],"as":[111,135],"competitive":[113],"alternative":[114],"saved":[122],"using":[124],"be":[130],"more":[131],"effectively":[132],"utilized,":[133],"such":[134],"enlarging":[137],"model":[138,151],"hidden":[139],"dimensions":[140],"under":[141],"fixed":[142],"budgets.":[143,189],"confirm":[145],"these":[146,191],"through":[147],"empirical":[148],"validations":[149],"across":[150],"scales:":[152],"FFNs":[154,157],"outperform":[155],"up":[158],"400M":[160],"achieve":[162],"comparable":[163],"performance":[164],"larger":[166],"scales":[167],"1B":[169],"parameters;":[170],"variants":[173],"reduced":[175],"increased":[178],"show":[181],"improvements":[183],"over":[184],"configurations":[186],"matched":[188],"Together,":[190],"findings":[192],"shed":[193],"new":[194],"light":[195],"on":[196],"work":[198],"prompt":[200],"rethinking":[202],"balance":[210],"towards":[215],"efficient":[216],"expressive":[218],"modern":[219],"models.":[221]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-02-10T00:00:00"}
