{"id":"https://openalex.org/W7154081199","doi":"https://doi.org/10.48550/arxiv.2604.08826","title":"HiFloat4 Format for Language Model Pre-training on Ascend NPUs","display_name":"HiFloat4 Format for Language Model Pre-training on Ascend NPUs","publication_year":2026,"publication_date":"2026-04-09","ids":{"openalex":"https://openalex.org/W7154081199","doi":"https://doi.org/10.48550/arxiv.2604.08826"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.08826","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.08826","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.08826","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5083845399","display_name":"Mehran Taghian","orcid":"https://orcid.org/0000-0003-4996-353X"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Taghian, Mehran","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5086399135","display_name":"Yunke Peng","orcid":"https://orcid.org/0000-0001-5240-0725"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peng, Yunke","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133527225","display_name":"Xing Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Xing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133506734","display_name":"Yao Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133514247","display_name":"Yaoyuan Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yaoyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133520978","display_name":"Wei Guo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Wei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063032740","display_name":"Yuanyong Luo","orcid":"https://orcid.org/0000-0002-4450-066X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luo, Yuanyong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111768283","display_name":"Tianchi Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Tianchi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124980534","display_name":"Junsong Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Junsong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133537900","display_name":"Xin Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133507051","display_name":"Hu Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Hu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133526904","display_name":"Yu Cheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cheng, Yu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125940068","display_name":"Ziwei Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Ziwei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133521344","display_name":"Hongliang Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Hongliang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133551656","display_name":"Mehdi Rahimifar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rahimifar, Mehdi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133530322","display_name":"Lei Yan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yan, Lei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133525029","display_name":"Xuefei Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xuefei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133475632","display_name":"Zhuang Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Zhuang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133494807","display_name":"Lei Liu","orcid":"https://orcid.org/0009-0007-8125-5046"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Lei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133519248","display_name":"Hui Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Hui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022932578","display_name":"Anandharaju Durai Raju","orcid":"https://orcid.org/0000-0001-9873-5870"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Raju, Anandharaju Durai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133478761","display_name":"Hoang Le","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Le, Hoang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085639863","display_name":"Hei Yi Mak","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mak, Hei Yi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003406651","display_name":"Tanzila Rahman","orcid":"https://orcid.org/0000-0001-8763-8805"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rahman, Tanzila","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5003674260","display_name":"Shadan Golestan","orcid":"https://orcid.org/0000-0001-7906-2287"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Golestan, Shadan","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":25,"corresponding_author_ids":["https://openalex.org/A5083845399"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.33379998803138733,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.33379998803138733,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.10320000350475311,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.05719999969005585,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.5306000113487244},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.3970000147819519},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.3628000020980835},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.34769999980926514},{"id":"https://openalex.org/keywords/linear-model","display_name":"Linear model","score":0.3465999960899353},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.3294000029563904}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8669000267982483},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.5306000113487244},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.448199987411499},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.43799999356269836},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.3970000147819519},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.3628000020980835},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.34769999980926514},{"id":"https://openalex.org/C163175372","wikidata":"https://www.wikidata.org/wiki/Q3339222","display_name":"Linear model","level":2,"score":0.3465999960899353},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.34599998593330383},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.3294000029563904},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.32670000195503235},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.3140999972820282},{"id":"https://openalex.org/C2777115002","wikidata":"https://www.wikidata.org/wiki/Q7168246","display_name":"Performance prediction","level":2,"score":0.3095000088214874},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.30469998717308044},{"id":"https://openalex.org/C120936955","wikidata":"https://www.wikidata.org/wiki/Q2155640","display_name":"Empirical research","level":2,"score":0.3001999855041504},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.27160000801086426}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.08826","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.08826","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.08826","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.08826","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"foundation":[1],"models":[2,24,61],"have":[3],"become":[4],"central":[5],"to":[6,54,65,75,153],"modern":[7],"machine":[8],"learning,":[9],"with":[10,14,97,111],"performance":[11],"scaling":[12],"predictably":[13],"model":[15],"size":[16],"and":[17,21,28,49,71,93,113,129,132,141,189,200],"data.":[18],"However,":[19],"training":[20,36,101,155,186],"deploying":[22],"such":[23],"incur":[25],"substantial":[26],"computational":[27],"memory":[29,72],"costs,":[30],"motivating":[31],"the":[32,83,171,191],"development":[33],"of":[34,166,174,184],"low-precision":[35],"techniques.":[37],"Recent":[38],"work":[39],"has":[40],"demonstrated":[41],"that":[42,156],"4-bit":[43,175],"floating-point":[44],"(FP4)":[45],"formats--such":[46],"as":[47],"MXFP4":[48,98],"NVFP4--can":[50],"be":[51],"successfully":[52],"applied":[53],"linear":[55,112,139],"GEMM":[56,115],"operations":[57,116],"in":[58,68,99,119,145,197],"large":[59],"language":[60],"(LLMs),":[62],"achieving":[63],"up":[64],"4x":[66],"improvements":[67],"compute":[69],"throughput":[70],"efficiency":[73,172],"compared":[74],"higher-precision":[76],"baselines.":[77],"In":[78],"this":[79],"work,":[80],"we":[81,148],"investigate":[82],"recently":[84],"proposed":[85],"HiFloat4":[86],"FP4":[87,120,154,185,195],"format":[88],"for":[89],"Huawei":[90],"Ascend":[91,108],"NPUs":[92,188],"systematically":[94],"compare":[95],"it":[96],"large-scale":[100,198],"settings.":[102],"All":[103],"experiments":[104],"are":[105],"conducted":[106],"on":[107,187],"NPU":[109],"clusters,":[110],"expert":[114],"performed":[117],"entirely":[118],"precision.":[121],"We":[122],"evaluate":[123],"both":[124,137],"dense":[125,199],"architectures":[126],"(e.g.,":[127],"Pangu":[128],"LLaMA-style":[130],"models)":[131],"mixture-of-experts":[133],"(MoE)":[134],"models,":[135],"where":[136],"standard":[138],"layers":[140],"expert-specific":[142],"GEMMs":[143],"operate":[144],"FP4.":[146],"Furthermore,":[147],"explore":[149],"stabilization":[150],"techniques":[151],"tailored":[152],"significantly":[157],"reduce":[158],"numerical":[159],"degradation,":[160],"maintaining":[161],"relative":[162],"error":[163],"within":[164],"1%":[165],"full-precision":[167],"baselines":[168],"while":[169],"preserving":[170],"benefits":[173],"computation.":[176],"Our":[177],"results":[178],"provide":[179],"a":[180],"comprehensive":[181],"empirical":[182],"study":[183],"highlight":[190],"practical":[192],"trade-offs":[193],"between":[194],"formats":[196],"MoE":[201],"models.":[202]},"counts_by_year":[],"updated_date":"2026-04-14T06:08:25.285971","created_date":"2026-04-14T00:00:00"}
