{"id":"https://openalex.org/W6929631384","doi":"https://doi.org/10.48550/arxiv.2506.17673","title":"FaithfulSAE: Towards Capturing Faithful Features with Sparse Autoencoders without External Dataset Dependencies","display_name":"FaithfulSAE: Towards Capturing Faithful Features with Sparse Autoencoders without External Dataset Dependencies","publication_year":2025,"publication_date":"2025-06-21","ids":{"openalex":"https://openalex.org/W6929631384","doi":"https://doi.org/10.48550/arxiv.2506.17673"},"language":"en","primary_location":{"id":"doi:10.48550/arxiv.2506.17673","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2506.17673","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2506.17673","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Cho, Seonglae","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Cho, Seonglae","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Oh, Harryn","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Oh, Harryn","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Lee, Donghyun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Donghyun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Vieira, Luis Eduardo Rodrigues","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vieira, Luis Eduardo Rodrigues","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Bermingham, Andrew","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bermingham, Andrew","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Sayed, Ziad El","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sayed, Ziad El","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T12673","display_name":"Microbial Metabolism and Applications","score":0.12430000305175781,"subfield":{"id":"https://openalex.org/subfields/1305","display_name":"Biotechnology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T12673","display_name":"Microbial Metabolism and Applications","score":0.12430000305175781,"subfield":{"id":"https://openalex.org/subfields/1305","display_name":"Biotechnology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T12817","display_name":"Pharmacological Effects of Natural Compounds","score":0.054099999368190765,"subfield":{"id":"https://openalex.org/subfields/3004","display_name":"Pharmacology"},"field":{"id":"https://openalex.org/fields/30","display_name":"Pharmacology, Toxicology and Pharmaceutics"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T11316","display_name":"Mycobacterium research and diagnosis","score":0.03689999878406525,"subfield":{"id":"https://openalex.org/subfields/2713","display_name":"Epidemiology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/interpretability","display_name":"Interpretability","score":0.8140000104904175},{"id":"https://openalex.org/keywords/dependency","display_name":"Dependency (UML)","score":0.5034999847412109},{"id":"https://openalex.org/keywords/initialization","display_name":"Initialization","score":0.4372999966144562},{"id":"https://openalex.org/keywords/hallucinating","display_name":"Hallucinating","score":0.43479999899864197},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.42579999566078186},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.4052000045776367},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.34380000829696655},{"id":"https://openalex.org/keywords/support-vector-machine","display_name":"Support vector machine","score":0.33399999141693115},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.3237999975681305}],"concepts":[{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.8140000104904175},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6496999859809875},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6133999824523926},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5317000150680542},{"id":"https://openalex.org/C19768560","wikidata":"https://www.wikidata.org/wiki/Q320727","display_name":"Dependency (UML)","level":2,"score":0.5034999847412109},{"id":"https://openalex.org/C114466953","wikidata":"https://www.wikidata.org/wiki/Q6034165","display_name":"Initialization","level":2,"score":0.4372999966144562},{"id":"https://openalex.org/C2911011789","wikidata":"https://www.wikidata.org/wiki/Q130741","display_name":"Hallucinating","level":2,"score":0.43479999899864197},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.42579999566078186},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4052000045776367},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.38510000705718994},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.34380000829696655},{"id":"https://openalex.org/C12267149","wikidata":"https://www.wikidata.org/wiki/Q282453","display_name":"Support vector machine","level":2,"score":0.33399999141693115},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.3237999975681305},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.2969000041484833},{"id":"https://openalex.org/C95623464","wikidata":"https://www.wikidata.org/wiki/Q1096149","display_name":"Classifier (UML)","level":2,"score":0.2924000024795532},{"id":"https://openalex.org/C22019652","wikidata":"https://www.wikidata.org/wiki/Q331309","display_name":"Overfitting","level":3,"score":0.2858999967575073},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.2825999855995178},{"id":"https://openalex.org/C120567893","wikidata":"https://www.wikidata.org/wiki/Q1582085","display_name":"Knowledge extraction","level":2,"score":0.2770000100135803},{"id":"https://openalex.org/C83665646","wikidata":"https://www.wikidata.org/wiki/Q42139305","display_name":"Feature vector","level":2,"score":0.27639999985694885},{"id":"https://openalex.org/C108010975","wikidata":"https://www.wikidata.org/wiki/Q500094","display_name":"Pruning","level":2,"score":0.26489999890327454},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.2500999867916107},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.23690000176429749},{"id":"https://openalex.org/C46686674","wikidata":"https://www.wikidata.org/wiki/Q466303","display_name":"Boosting (machine learning)","level":2,"score":0.22939999401569366},{"id":"https://openalex.org/C199475168","wikidata":"https://www.wikidata.org/wiki/Q1539689","display_name":"MedDRA","level":4,"score":0.22259999811649323},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.2160000056028366},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.2117999941110611},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.20739999413490295},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2061000019311905},{"id":"https://openalex.org/C58166","wikidata":"https://www.wikidata.org/wiki/Q224821","display_name":"Fuzzy logic","level":2,"score":0.20239999890327454},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.2012999951839447},{"id":"https://openalex.org/C55282118","wikidata":"https://www.wikidata.org/wiki/Q252683","display_name":"Snapshot (computer storage)","level":2,"score":0.19699999690055847},{"id":"https://openalex.org/C148483581","wikidata":"https://www.wikidata.org/wiki/Q446488","display_name":"Feature selection","level":2,"score":0.19269999861717224},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.19220000505447388},{"id":"https://openalex.org/C190839683","wikidata":"https://www.wikidata.org/wiki/Q2448197","display_name":"Train","level":2,"score":0.19210000336170197},{"id":"https://openalex.org/C148524875","wikidata":"https://www.wikidata.org/wiki/Q6975395","display_name":"F1 score","level":2,"score":0.19009999930858612},{"id":"https://openalex.org/C136389625","wikidata":"https://www.wikidata.org/wiki/Q334384","display_name":"Supervised learning","level":3,"score":0.1858000010251999},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.1858000010251999},{"id":"https://openalex.org/C2780598303","wikidata":"https://www.wikidata.org/wiki/Q65921492","display_name":"Flexibility (engineering)","level":2,"score":0.18389999866485596},{"id":"https://openalex.org/C75553542","wikidata":"https://www.wikidata.org/wiki/Q178161","display_name":"A priori and a posteriori","level":2,"score":0.18289999663829803},{"id":"https://openalex.org/C56372850","wikidata":"https://www.wikidata.org/wiki/Q1050404","display_name":"Sparse matrix","level":3,"score":0.18219999969005585}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2506.17673","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2506.17673","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2506.17673","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2506.17673","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Sparse":[0],"Autoencoders":[1],"(SAEs)":[2],"have":[3,23,35],"emerged":[4],"as":[5],"a":[6,103,148],"promising":[7],"solution":[8],"for":[9],"decomposing":[10],"large":[11],"language":[12],"model":[13,65],"representations":[14],"into":[15],"interpretable":[16],"features.":[17,44],"However,":[18],"Paulo":[19],"and":[20,30,146],"Belrose":[21],"(2025)":[22,34],"highlighted":[24],"instability":[25],"across":[26,131],"different":[27],"initialization":[28],"seeds,":[29],"Heap":[31],"et":[32],"al.":[33],"pointed":[36],"out":[37,155],"that":[38,90,105,118],"SAEs":[39,51,107,120,127,136],"may":[40,68],"not":[41],"capture":[42],"model-internal":[43,173],"These":[45],"problems":[46],"likely":[47],"stem":[48],"from":[49,58],"training":[50,119,183],"on":[52,108,121,138,165],"external":[53,166],"datasets":[54,124,140],"-":[55,66],"either":[56],"collected":[57],"the":[59,74,92,109,142,163,177],"Web":[60],"or":[61],"generated":[62],"by":[63,170],"another":[64],"which":[67,85],"contain":[69],"out-of-distribution":[70],"(OOD)":[71],"data":[72],"beyond":[73],"model's":[75,93,110],"generalisation":[76],"capabilities.":[77],"This":[78],"can":[79],"result":[80],"in":[81,126,141,153],"hallucinated":[82],"SAE":[83,143,182],"features,":[84],"we":[86,100,116],"term":[87],"\"Fake":[88],"Features\",":[89],"misrepresent":[91],"internal":[94],"activations.":[95],"To":[96],"address":[97],"these":[98],"issues,":[99],"propose":[101],"FaithfulSAE,":[102],"method":[104],"trains":[106],"own":[111],"synthetic":[112],"dataset.":[113],"Using":[114],"FaithfulSAEs,":[115],"demonstrate":[117],"less-OOD":[122],"instruction":[123],"results":[125],"being":[128],"more":[129],"stable":[130],"seeds.":[132],"Notably,":[133],"FaithfulSAEs":[134],"outperform":[135],"trained":[137],"web-based":[139],"probing":[144],"task":[145],"exhibit":[147],"lower":[149],"Fake":[150],"Feature":[151],"Ratio":[152],"5":[154],"of":[156,181],"7":[157],"models.":[158],"Overall,":[159],"our":[160],"approach":[161],"eliminates":[162],"dependency":[164],"datasets,":[167],"advancing":[168],"interpretability":[169],"better":[171],"capturing":[172],"features":[174],"while":[175],"highlighting":[176],"often":[178],"neglected":[179],"importance":[180],"datasets.":[184]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
