{"id":"https://openalex.org/W4396773582","doi":"https://doi.org/10.1145/3664606","title":"Unveiling Code Pre-Trained Models: Investigating Syntax and Semantics Capacities","display_name":"Unveiling Code Pre-Trained Models: Investigating Syntax and Semantics Capacities","publication_year":2024,"publication_date":"2024-05-09","ids":{"openalex":"https://openalex.org/W4396773582","doi":"https://doi.org/10.1145/3664606"},"language":"en","primary_location":{"id":"doi:10.1145/3664606","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3664606","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3664606","source":{"id":"https://openalex.org/S142627899","display_name":"ACM Transactions on Software Engineering and Methodology","issn_l":"1049-331X","issn":["1049-331X","1557-7392"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Software Engineering and Methodology","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"bronze","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3664606","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5057849611","display_name":"Wei Ma","orcid":"https://orcid.org/0000-0002-0044-466X"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":true,"raw_author_name":"Wei Ma","raw_affiliation_strings":["College of Computing and Data Science (CCDS), Nanyang Technological University, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0000-0002-0044-466X","affiliations":[{"raw_affiliation_string":"College of Computing and Data Science (CCDS), Nanyang Technological University, Singapore, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045943684","display_name":"Shangqing Liu","orcid":"https://orcid.org/0000-0002-5598-4006"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Shangqing Liu","raw_affiliation_strings":["Nanyang Technological University, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0000-0002-5598-4006","affiliations":[{"raw_affiliation_string":"Nanyang Technological University, Singapore, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101793596","display_name":"Mengjie Zhao","orcid":"https://orcid.org/0009-0009-2391-4028"},"institutions":[{"id":"https://openalex.org/I8204097","display_name":"Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen","ror":"https://ror.org/05591te55","country_code":"DE","type":"education","lineage":["https://openalex.org/I8204097"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Mengjie Zhao","raw_affiliation_strings":["Ludwig Maximilian University of Munich, Munich, Germany"],"raw_orcid":"https://orcid.org/0009-0009-2391-4028","affiliations":[{"raw_affiliation_string":"Ludwig Maximilian University of Munich, Munich, Germany","institution_ids":["https://openalex.org/I8204097"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084396416","display_name":"Xiaofei Xie","orcid":"https://orcid.org/0000-0002-1288-6502"},"institutions":[{"id":"https://openalex.org/I79891267","display_name":"Singapore Management University","ror":"https://ror.org/050qmg959","country_code":"SG","type":"education","lineage":["https://openalex.org/I79891267"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Xiaofei Xie","raw_affiliation_strings":["Singapore Management University, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0000-0002-1288-6502","affiliations":[{"raw_affiliation_string":"Singapore Management University, Singapore, Singapore","institution_ids":["https://openalex.org/I79891267"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068123591","display_name":"Wenhan Wang","orcid":"https://orcid.org/0000-0002-0585-2136"},"institutions":[{"id":"https://openalex.org/I154425047","display_name":"University of Alberta","ror":"https://ror.org/0160cpw27","country_code":"CA","type":"education","lineage":["https://openalex.org/I154425047"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Wenhang Wang","raw_affiliation_strings":["University of Alberta, Edmonton, Canada"],"raw_orcid":"https://orcid.org/0000-0002-0585-2136","affiliations":[{"raw_affiliation_string":"University of Alberta, Edmonton, Canada","institution_ids":["https://openalex.org/I154425047"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101406450","display_name":"Qiang Hu","orcid":"https://orcid.org/0000-0002-8251-1669"},"institutions":[{"id":"https://openalex.org/I186903577","display_name":"University of Luxembourg","ror":"https://ror.org/036x5ad56","country_code":"LU","type":"education","lineage":["https://openalex.org/I186903577"]},{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP","LU"],"is_corresponding":false,"raw_author_name":"Qiang Hu","raw_affiliation_strings":["The University of Tokyo, Tokyo, Japan","University of Luxembourg, Esch-sur-Alzette, Luxembourg"],"raw_orcid":"https://orcid.org/0000-0002-8251-1669","affiliations":[{"raw_affiliation_string":"The University of Tokyo, Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]},{"raw_affiliation_string":"University of Luxembourg, Esch-sur-Alzette, Luxembourg","institution_ids":["https://openalex.org/I186903577"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078725020","display_name":"Junyin Zhang","orcid":"https://orcid.org/0009-0008-6825-1160"},"institutions":[{"id":"https://openalex.org/I2250955327","display_name":"Huawei Technologies (China)","ror":"https://ror.org/00cmhce21","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250955327"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jie Zhang","raw_affiliation_strings":["Noah's Ark Lab, Huawei, Xi'an, China"],"raw_orcid":"https://orcid.org/0009-0008-6825-1160","affiliations":[{"raw_affiliation_string":"Noah's Ark Lab, Huawei, Xi'an, China","institution_ids":["https://openalex.org/I2250955327"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100355692","display_name":"Yang Liu","orcid":"https://orcid.org/0000-0001-7300-9215"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Yang Liu","raw_affiliation_strings":["Nanyang Technological University, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0000-0001-7300-9215","affiliations":[{"raw_affiliation_string":"Nanyang Technological University, Singapore, Singapore","institution_ids":["https://openalex.org/I172675005"]}]}],"institutions":[],"countries_distinct_count":6,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5057849611"],"corresponding_institution_ids":["https://openalex.org/I172675005"],"apc_list":null,"apc_paid":null,"fwci":17.4431,"has_fulltext":false,"cited_by_count":25,"citation_normalized_percentile":{"value":0.99134021,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":100},"biblio":{"volume":"33","issue":"7","first_page":"1","last_page":"29"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11241","display_name":"Advanced Malware Detection Techniques","score":0.9639999866485596,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.9459999799728394,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.833996057510376},{"id":"https://openalex.org/keywords/syntax","display_name":"Syntax","score":0.7913010120391846},{"id":"https://openalex.org/keywords/abstract-syntax-tree","display_name":"Abstract syntax tree","score":0.7796858549118042},{"id":"https://openalex.org/keywords/abstract-syntax","display_name":"Abstract syntax","score":0.7188870310783386},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.6354018449783325},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.5726616382598877},{"id":"https://openalex.org/keywords/syntax-error","display_name":"Syntax error","score":0.5504549145698547},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.47488850355148315},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4624076187610626},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.4529591202735901},{"id":"https://openalex.org/keywords/source-code","display_name":"Source code","score":0.42471301555633545}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.833996057510376},{"id":"https://openalex.org/C60048249","wikidata":"https://www.wikidata.org/wiki/Q37437","display_name":"Syntax","level":2,"score":0.7913010120391846},{"id":"https://openalex.org/C58646249","wikidata":"https://www.wikidata.org/wiki/Q127380","display_name":"Abstract syntax tree","level":3,"score":0.7796858549118042},{"id":"https://openalex.org/C114408938","wikidata":"https://www.wikidata.org/wiki/Q333373","display_name":"Abstract syntax","level":3,"score":0.7188870310783386},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.6354018449783325},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.5726616382598877},{"id":"https://openalex.org/C11742125","wikidata":"https://www.wikidata.org/wiki/Q1195374","display_name":"Syntax error","level":4,"score":0.5504549145698547},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.47488850355148315},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4624076187610626},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.4529591202735901},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.42471301555633545},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3664606","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3664606","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3664606","source":{"id":"https://openalex.org/S142627899","display_name":"ACM Transactions on Software Engineering and Methodology","issn_l":"1049-331X","issn":["1049-331X","1557-7392"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Software Engineering and Methodology","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1145/3664606","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3664606","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3664606","source":{"id":"https://openalex.org/S142627899","display_name":"ACM Transactions on Software Engineering and Methodology","issn_l":"1049-331X","issn":["1049-331X","1557-7392"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Software Engineering and Methodology","raw_type":"journal-article"},"sustainable_development_goals":[{"score":0.6399999856948853,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320320709","display_name":"National Research Foundation Singapore","ror":"https://ror.org/03cpyc314"}],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4396773582.pdf"},"referenced_works_count":93,"referenced_works":["https://openalex.org/W2109553965","https://openalex.org/W2169044454","https://openalex.org/W2798569372","https://openalex.org/W2799640043","https://openalex.org/W2802049774","https://openalex.org/W2908854766","https://openalex.org/W2963935794","https://openalex.org/W2964204621","https://openalex.org/W2970066309","https://openalex.org/W2973529529","https://openalex.org/W2982399380","https://openalex.org/W2994475624","https://openalex.org/W2997275048","https://openalex.org/W2999309192","https://openalex.org/W3001279689","https://openalex.org/W3008088841","https://openalex.org/W3009290003","https://openalex.org/W3017779903","https://openalex.org/W3021639139","https://openalex.org/W3035882142","https://openalex.org/W3042703469","https://openalex.org/W3046039946","https://openalex.org/W3086007799","https://openalex.org/W3098605233","https://openalex.org/W3099130275","https://openalex.org/W3103410128","https://openalex.org/W3105148046","https://openalex.org/W3108032709","https://openalex.org/W3109966548","https://openalex.org/W3118485687","https://openalex.org/W3119507053","https://openalex.org/W3120991880","https://openalex.org/W3123811550","https://openalex.org/W3126675481","https://openalex.org/W3134616970","https://openalex.org/W3163829544","https://openalex.org/W3170092793","https://openalex.org/W3170962973","https://openalex.org/W3198685994","https://openalex.org/W4206238894","https://openalex.org/W4212774754","https://openalex.org/W4214604534","https://openalex.org/W4221153523","https://openalex.org/W4221166942","https://openalex.org/W4226445156","https://openalex.org/W4283026156","https://openalex.org/W4284710241","https://openalex.org/W4288086191","https://openalex.org/W4288088047","https://openalex.org/W4288351520","https://openalex.org/W4300113433","https://openalex.org/W4304700930","https://openalex.org/W4312727366","https://openalex.org/W4313483544","https://openalex.org/W4313549837","https://openalex.org/W4313563520","https://openalex.org/W4317940259","https://openalex.org/W4319793108","https://openalex.org/W4376122390","https://openalex.org/W4376167329","https://openalex.org/W4376652621","https://openalex.org/W4379512492","https://openalex.org/W4380993527","https://openalex.org/W4382317573","https://openalex.org/W4384302803","https://openalex.org/W4384345647","https://openalex.org/W4384345728","https://openalex.org/W4384918448","https://openalex.org/W4385245566","https://openalex.org/W4385428276","https://openalex.org/W4385573867","https://openalex.org/W4385902209","https://openalex.org/W4385967654","https://openalex.org/W4386081573","https://openalex.org/W4386185625","https://openalex.org/W4387427818","https://openalex.org/W4387839006","https://openalex.org/W4389519352","https://openalex.org/W4391277903","https://openalex.org/W4392414327","https://openalex.org/W4394638297","https://openalex.org/W4394664141","https://openalex.org/W4402665833","https://openalex.org/W6739901393","https://openalex.org/W6751512867","https://openalex.org/W6767098714","https://openalex.org/W6772383348","https://openalex.org/W6810846146","https://openalex.org/W6838461927","https://openalex.org/W6844270267","https://openalex.org/W6848909144","https://openalex.org/W6852746770","https://openalex.org/W6855970221"],"related_works":["https://openalex.org/W2613250302","https://openalex.org/W2077104824","https://openalex.org/W2005927140","https://openalex.org/W2390421503","https://openalex.org/W2387926336","https://openalex.org/W2536864162","https://openalex.org/W2143166528","https://openalex.org/W2051208666","https://openalex.org/W2073072874","https://openalex.org/W966509243"],"abstract_inverted_index":{"Code":[0],"models":[1,24,74,87,107,117,220,232,330],"have":[2,18],"made":[3],"significant":[4],"advancements":[5],"in":[6,25,133,163,182,221,234,255,265,276,315,328,346,358,365],"code":[7,27,39,86,89,100,105,136,144,173,192,204,219,223,236,257,317,329,334,343,359],"intelligence":[8],"by":[9],"encoding":[10,256,316],"knowledge":[11],"about":[12],"programming":[13],"languages.":[14],"While":[15],"previous":[16],"studies":[17],"explored":[19],"the":[20,46,53,68,130,164,178,187,191,197,213,240,246,323,340],"capabilities":[21],"of":[22,48,71,98,199,217,249,294,304,342],"these":[23,231],"learning":[26,73,134,222,235],"syntax,":[28,237],"there":[29],"has":[30],"been":[31],"limited":[32],"investigation":[33,303],"on":[34],"their":[35,253,299,362],"ability":[36],"to":[37,60,75,128,203,331,339],"understand":[38],"semantics.":[40,139,226,318,335],"Additionally,":[41,194],"existing":[42],"analyses":[43],"assume":[44],"that":[45,230,308],"number":[47],"edges":[49],"between":[50,190,243],"nodes":[51],"at":[52],"abstract":[54],"syntax":[55,61,90,137,145,179,224,241,247,347],"tree":[56],"(AST)":[57],"is":[58],"related":[59,202],"distance,":[62],"and":[63,91,111,113,121,138,146,158,186,215,225,245,261,268,348],"also":[64,176],"often":[65],"require":[66],"transforming":[67],"high-dimensional":[69],"space":[70],"deep":[72,300],"a":[76,95],"low-dimensional":[77],"one,":[78],"which":[79],"may":[80],"introduce":[81],"inaccuracies.":[82],"To":[83],"study":[84,337],"how":[85],"represent":[88],"semantics,":[92],"we":[93,195],"conduct":[94],"comprehensive":[96],"analysis":[97],"seven":[99],"models,":[101,360],"including":[102],"four":[103,125],"representative":[104],"pre-trained":[106,289],"(CodeBERT,":[108],"GraphCodeBERT,":[109],"CodeT5,":[110],"UnixCoder)":[112],"three":[114],"large":[115],"language":[116],"(LLMs)":[118],"(StarCoder,":[119],"CodeLlama":[120],"CodeT5+).":[122],"We":[123,175,279],"design":[124],"probing":[126,141],"tasks":[127,142],"assess":[129],"models\u2019":[131,344],"capacities":[132],"both":[135],"These":[140,167],"reconstruct":[143],"semantics":[147,258,349],"structures":[148,168],"(AST,":[149],"control":[150,159,267],"dependence":[151,155],"graph":[152,156,161],"(CDG),":[153],"data":[154,269],"(DDG),":[157],"flow":[160],"(CFG))":[162],"representation":[165,185],"space.":[166],"are":[169],"core":[170],"concepts":[171],"for":[172,325,355],"understanding.":[174],"investigate":[177],"token":[180,184],"role":[181],"each":[183],"long":[188],"dependency":[189],"tokens.":[193,251],"analyze":[196],"distribution":[198],"attention":[200,305,310],"weights":[201,306],"semantic":[205],"structures.":[206],"Through":[207],"extensive":[208],"analysis,":[209],"our":[210],"findings":[211,321,352],"highlight":[212],"strengths":[214],"limitations":[216],"different":[218,309],"The":[227,291,302],"results":[228],"demonstrate":[229,263],"excel":[233],"successfully":[238],"capturing":[239,266],"relationships":[242],"tokens":[244],"roles":[248,314],"individual":[250],"However,":[252],"performance":[254,275],"varies.":[259],"CodeT5":[260],"CodeBERT":[262],"proficiency":[264],"dependencies,":[270],"whereas":[271],"UnixCoder":[272],"shows":[273],"weaker":[274],"this":[277],"aspect.":[278],"do":[280],"not":[281],"observe":[282],"LLMs":[283,295],"generally":[284],"performing":[285],"much":[286],"better":[287,297,332],"than":[288,298],"models.":[290],"shallow":[292],"layers":[293],"perform":[296],"layers.":[301],"reveals":[307],"heads":[311],"play":[312],"distinct":[313],"Our":[319,351],"research":[320],"emphasize":[322],"need":[324],"further":[326],"enhancements":[327],"learn":[333],"This":[336],"contributes":[338],"understanding":[341],"abilities":[345],"analysis.":[350],"provide":[353],"guidance":[354],"future":[356],"improvements":[357],"facilitating":[361],"effective":[363],"application":[364],"various":[366],"code-related":[367],"tasks.":[368]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":19},{"year":2024,"cited_by_count":4}],"updated_date":"2026-06-04T09:04:59.091469","created_date":"2025-10-10T00:00:00"}
