{"id":"https://openalex.org/W4320067944","doi":"https://doi.org/10.1145/3545008.3545032","title":"IATF: An Input-Aware Tuning Framework for Compact BLAS Based on ARMv8 CPUs","display_name":"IATF: An Input-Aware Tuning Framework for Compact BLAS Based on ARMv8 CPUs","publication_year":2022,"publication_date":"2022-08-29","ids":{"openalex":"https://openalex.org/W4320067944","doi":"https://doi.org/10.1145/3545008.3545032"},"language":"en","primary_location":{"id":"doi:10.1145/3545008.3545032","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3545008.3545032","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3545008.3545032","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 51st International Conference on Parallel Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3545008.3545032","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5039652101","display_name":"Cunyang Wei","orcid":"https://orcid.org/0009-0001-8910-4951"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Cunyang Wei","raw_affiliation_strings":["The Institute of Computing Technology of the Chinese Academy of Sciences, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"The Institute of Computing Technology of the Chinese Academy of Sciences, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101816350","display_name":"Haipeng Jia","orcid":"https://orcid.org/0000-0002-9855-5367"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haipeng Jia","raw_affiliation_strings":["The Institute of Computing Technology of the Chinese Academy of Sciences, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"The Institute of Computing Technology of the Chinese Academy of Sciences, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001666028","display_name":"Yunquan Zhang","orcid":"https://orcid.org/0000-0002-2618-5088"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yunquan Zhang","raw_affiliation_strings":["The Institute of Computing Technology of the Chinese Academy of Sciences, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"The Institute of Computing Technology of the Chinese Academy of Sciences, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007541774","display_name":"Liusha Xu","orcid":null},"institutions":[{"id":"https://openalex.org/I2250955327","display_name":"Huawei Technologies (China)","ror":"https://ror.org/00cmhce21","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250955327"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Liusha Xu","raw_affiliation_strings":["Huawei Technologies Co., Ltd., China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Huawei Technologies Co., Ltd., China","institution_ids":["https://openalex.org/I2250955327"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100612749","display_name":"Ji Qi","orcid":"https://orcid.org/0000-0003-4585-6534"},"institutions":[{"id":"https://openalex.org/I2250955327","display_name":"Huawei Technologies (China)","ror":"https://ror.org/00cmhce21","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250955327"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ji Qi","raw_affiliation_strings":["Huawei Technologies Co., Ltd., China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Huawei Technologies Co., Ltd., China","institution_ids":["https://openalex.org/I2250955327"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.3997,"has_fulltext":true,"cited_by_count":6,"citation_normalized_percentile":{"value":0.8109319,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"11"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7884336709976196},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.6993494629859924},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.6275362372398376},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.6257190704345703},{"id":"https://openalex.org/keywords/matrix-multiplication","display_name":"Matrix multiplication","score":0.5144684314727783},{"id":"https://openalex.org/keywords/simd","display_name":"SIMD","score":0.4749229848384857},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.2314666211605072},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.0932198166847229}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7884336709976196},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.6993494629859924},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6275362372398376},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.6257190704345703},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.5144684314727783},{"id":"https://openalex.org/C150552126","wikidata":"https://www.wikidata.org/wiki/Q339387","display_name":"SIMD","level":2,"score":0.4749229848384857},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.2314666211605072},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0932198166847229},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C84114770","wikidata":"https://www.wikidata.org/wiki/Q46344","display_name":"Quantum","level":2,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3545008.3545032","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3545008.3545032","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3545008.3545032","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 51st International Conference on Parallel Processing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3545008.3545032","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3545008.3545032","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3545008.3545032","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 51st International Conference on Parallel Processing","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1982466500","display_name":"\u9762\u5411\u4f17\u6838\u4f53\u7cfb\u67b6\u6784\u7684\u5e76\u884c\u8ba1\u7b97\u6a21\u578b\u4e0e\u6027\u80fd\u81ea\u9002\u5e94\u4f18\u5316\u7814\u7a76","funder_award_id":"61972376","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5832729736","display_name":null,"funder_award_id":"62032023","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G704334800","display_name":null,"funder_award_id":"62072431","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G8826344451","display_name":null,"funder_award_id":"No. 61972376, No. 62072431","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4320067944.pdf"},"referenced_works_count":19,"referenced_works":["https://openalex.org/W1983157164","https://openalex.org/W2043275593","https://openalex.org/W2064872546","https://openalex.org/W2073061372","https://openalex.org/W2084379367","https://openalex.org/W2252007067","https://openalex.org/W2273809747","https://openalex.org/W2499931820","https://openalex.org/W2767612671","https://openalex.org/W2786374423","https://openalex.org/W2949953997","https://openalex.org/W2971383048","https://openalex.org/W2972087877","https://openalex.org/W2984305089","https://openalex.org/W3099506637","https://openalex.org/W3166510811","https://openalex.org/W3210601829","https://openalex.org/W4244254628","https://openalex.org/W4288083057"],"related_works":["https://openalex.org/W2291920536","https://openalex.org/W2022397046","https://openalex.org/W2162726111","https://openalex.org/W1603583590","https://openalex.org/W338671845","https://openalex.org/W2063770303","https://openalex.org/W2018511057","https://openalex.org/W2286348849","https://openalex.org/W1587248296","https://openalex.org/W2001175489"],"abstract_inverted_index":{"Recently":[0],"the":[1,91,142,149,175],"mainstream":[2,197],"basic":[3],"linear":[4],"algebra":[5],"libraries":[6,23],"have":[7],"delivered":[8],"high":[9],"performance":[10,30,76,188],"on":[11,34,41,77,95],"large":[12,35,65,165],"scale":[13],"General":[14],"Matrix":[15],"Multiplication(GEMM)":[16],"and":[17,71,87,107,118,171,192],"Triangular":[18],"System":[19],"Solve(TRSM).":[20],"However,":[21],"these":[22],"are":[24,45],"still":[25],"insufficient":[26],"to":[27,73,113,123,140,158,174],"provide":[28],"sustained":[29],"for":[31,63,104,137,164],"batch":[32],"operations":[33],"groups":[36],"of":[37,67,144,167],"fixed-size":[38,68,168],"small":[39,69,169],"matrices":[40],"specific":[42],"architectures,":[43],"which":[44],"extensively":[46],"used":[47],"in":[48,190],"various":[49],"scientific":[50],"computing":[51,101,138],"applications.":[52],"In":[53,90,148],"this":[54],"paper,":[55],"we":[56,99,152],"propose":[57,100],"IATF,":[58],"an":[59,129,154,160],"input-aware":[60,155],"tuning":[61,156],"framework":[62],"optimizing":[64],"group":[66,166],"GEMM":[70,106,170,191],"TRSM":[72,193],"boost":[74],"near-optimal":[75],"ARMv8":[78],"architecture.":[79],"The":[80,179],"IATF":[81,184],"contains":[82],"two":[83],"stages:":[84],"install-time":[85,92],"stage":[86],"run-time":[88,150],"stage.":[89],"stage,":[93,151],"based":[94],"SIMD-friendly":[96],"data":[97,131],"layout,":[98],"kernel":[102,111,120,125],"templates":[103],"high-performance":[105],"TRSM,":[108,172],"analyze":[109],"optimal":[110],"sizes":[112],"increase":[114],"computational":[115],"instruction":[116],"ratio,":[117],"design":[119],"optimization":[121],"strategies":[122],"improve":[124],"execution":[126,162],"efficiency.":[127],"Furthermore,":[128],"optimized":[130],"packing":[132],"strategy":[133],"is":[134],"also":[135],"presented":[136],"kernels":[139],"minimize":[141],"cost":[143],"memory":[145],"accessing":[146],"overhead.":[147],"present":[153],"method":[157],"generate":[159],"efficient":[161],"plan":[163],"according":[173],"input":[176],"matrix":[177],"properties.":[178],"experimental":[180],"results":[181],"show":[182],"that":[183],"could":[185],"achieve":[186],"significant":[187],"improvements":[189],"compared":[194],"with":[195],"other":[196],"BLAS":[198],"libraries.":[199]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":3},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
