{"id":"https://openalex.org/W7155371970","doi":"https://doi.org/10.48550/arxiv.2604.19775","title":"From Actions to Understanding: Conformal Interpretability of Temporal Concepts in LLM Agents","display_name":"From Actions to Understanding: Conformal Interpretability of Temporal Concepts in LLM Agents","publication_year":2026,"publication_date":"2026-03-27","ids":{"openalex":"https://openalex.org/W7155371970","doi":"https://doi.org/10.48550/arxiv.2604.19775"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.19775","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.19775","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.19775","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5120580917","display_name":"Trilok Padhi","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Padhi, Trilok","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134391633","display_name":"Ramneet Kaur","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kaur, Ramneet","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5116971398","display_name":"Krishiv Agarwal","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Agarwal, Krishiv","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134448980","display_name":"Adam D. Cobb","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cobb, Adam D.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007525901","display_name":"Daniel Elenius","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Elenius, Daniel","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019134968","display_name":"Manoj Acharya","orcid":"https://orcid.org/0000-0003-0223-3556"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Acharya, Manoj","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011915615","display_name":"Colin Samplawski","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Samplawski, Colin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048112493","display_name":"Alexander Berenbeim","orcid":"https://orcid.org/0000-0001-8573-4270"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Berenbeim, Alexander M.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134418599","display_name":"Nathaniel D. Bastian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bastian, Nathaniel D.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035902535","display_name":"Susmit Jha","orcid":"https://orcid.org/0000-0001-5983-9095"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jha, Susmit","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134389710","display_name":"Anirban Roy","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Roy, Anirban","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":["https://openalex.org/A5120580917"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.6266000270843506,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.6266000270843506,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.12129999697208405,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.040800001472234726,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/interpretability","display_name":"Interpretability","score":0.9480999708175659},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5601999759674072},{"id":"https://openalex.org/keywords/event","display_name":"Event (particle physics)","score":0.3718000054359436},{"id":"https://openalex.org/keywords/space","display_name":"Space (punctuation)","score":0.3628000020980835},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.3456999957561493},{"id":"https://openalex.org/keywords/path","display_name":"Path (computing)","score":0.3450999855995178},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.34299999475479126},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.3010999858379364}],"concepts":[{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.9480999708175659},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7003999948501587},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5982999801635742},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5601999759674072},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.3718000054359436},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.3628000020980835},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.3456999957561493},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.3450999855995178},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.34299999475479126},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.3010999858379364},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2962000072002411},{"id":"https://openalex.org/C98214594","wikidata":"https://www.wikidata.org/wiki/Q850275","display_name":"Conformal map","level":2,"score":0.2955999970436096},{"id":"https://openalex.org/C25016198","wikidata":"https://www.wikidata.org/wiki/Q781833","display_name":"Temporal logic","level":2,"score":0.2928999960422516},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2870999872684479},{"id":"https://openalex.org/C153701036","wikidata":"https://www.wikidata.org/wiki/Q659974","display_name":"Trustworthiness","level":2,"score":0.28450000286102295},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.2831000089645386},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.27810001373291016},{"id":"https://openalex.org/C739882","wikidata":"https://www.wikidata.org/wiki/Q3560506","display_name":"Anomaly detection","level":2,"score":0.27570000290870667},{"id":"https://openalex.org/C13687954","wikidata":"https://www.wikidata.org/wiki/Q4826847","display_name":"Autonomous agent","level":2,"score":0.26829999685287476},{"id":"https://openalex.org/C148220186","wikidata":"https://www.wikidata.org/wiki/Q7111912","display_name":"Outcome (game theory)","level":2,"score":0.26350000500679016},{"id":"https://openalex.org/C130187892","wikidata":"https://www.wikidata.org/wiki/Q133327","display_name":"Spacetime","level":2,"score":0.2565999925136566},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.25600001215934753}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.19775","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.19775","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.19775","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.19775","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.6546921133995056,"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"Language":[1],"Models":[2],"(LLMs)":[3],"are":[4,90,137],"increasingly":[5],"deployed":[6],"as":[7,84,184,186],"autonomous":[8,196],"agents":[9,52],"capable":[10],"of":[11,48,99,115],"reasoning,":[12],"planning,":[13],"and":[14,27,130],"acting":[15],"within":[16],"interactive":[17,126,201],"environments.":[18],"Despite":[19],"their":[20,33],"growing":[21],"capability":[22],"to":[23,75,96,112],"perform":[24],"multi-step":[25],"reasoning":[26,119],"decision-making":[28],"tasks,":[29,66],"internal":[30,79],"mechanisms":[31],"guiding":[32],"sequential":[34],"behavior":[35],"remain":[36],"opaque.":[37],"This":[38],"paper":[39],"presents":[40],"a":[41,54,177],"framework":[42,63,162],"for":[43,64,163,180],"interpreting":[44],"the":[45,60,106,160,165,170,192],"temporal":[46,65,100,135],"evolution":[47],"concepts":[49,101,136],"in":[50,105,188,199],"LLM":[51,155],"through":[53],"step-wise":[55,69],"conformal":[56,61,73],"lens.":[57],"We":[58,147],"introduce":[59],"interpretability":[62],"which":[67],"combines":[68],"reward":[70],"modeling":[71],"with":[72,144],"prediction":[74],"statistically":[76],"label":[77],"model's":[78,107],"representation":[80],"at":[81],"each":[82],"step":[83],"successful":[85,167],"or":[86,118],"failing.":[87],"Linear":[88],"probes":[89],"then":[91],"trained":[92],"on":[93,123,152],"these":[94,134],"representations":[95],"identify":[97],"directions":[98,104,168],"-":[102],"latent":[103],"activation":[108],"space":[109],"that":[110,133],"correspond":[111],"consistent":[113],"notions":[114],"success,":[116],"failure":[117,182],"drift.":[120],"Experimental":[121],"results":[122,151],"two":[124],"simulated":[125],"environments,":[127],"namely":[128],"ScienceWorld":[129],"AlfWorld,":[131],"demonstrate":[132],"linearly":[138],"separable,":[139],"revealing":[140],"interpretable":[141],"structures":[142],"aligned":[143],"task":[145],"success.":[146],"further":[148],"show":[149],"preliminary":[150],"improving":[153],"an":[154],"agent's":[156],"performance":[157],"by":[158],"leveraging":[159],"proposed":[161,173],"steering":[164],"identified":[166],"inside":[169],"model.":[171],"The":[172],"approach,":[174],"thus,":[175],"offers":[176],"principled":[178],"method":[179],"early":[181],"detection":[183],"well":[185],"intervention":[187],"LLM-based":[189],"agents,":[190],"paving":[191],"path":[193],"towards":[194],"trustworthy":[195],"language":[197],"models":[198],"complex":[200],"settings.":[202]},"counts_by_year":[],"updated_date":"2026-05-04T08:30:34.212998","created_date":"2026-04-24T00:00:00"}
