{"id":"https://openalex.org/W7135082738","doi":"https://doi.org/10.48550/arxiv.2603.10300","title":"From Imitation to Intuition: Intrinsic Reasoning for Open-Instance Video Classification","display_name":"From Imitation to Intuition: Intrinsic Reasoning for Open-Instance Video Classification","publication_year":2026,"publication_date":"2026-03-11","ids":{"openalex":"https://openalex.org/W7135082738","doi":"https://doi.org/10.48550/arxiv.2603.10300"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.10300","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.10300","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.10300","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128837403","display_name":"Ke Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Ke","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046965072","display_name":"Xiangchen Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Xiangchen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069782593","display_name":"Yunjie Tian","orcid":"https://orcid.org/0000-0002-5103-3748"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tian, Yunjie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128881188","display_name":"Jiayu Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Jiayu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128823016","display_name":"Vishal M. Patel","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Patel, Vishal M.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128855118","display_name":"Di Fu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fu, Di","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8256000280380249,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8256000280380249,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.04899999871850014,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.042399998754262924,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/classifier","display_name":"Classifier (UML)","score":0.5295000076293945},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.5109999775886536},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.49390000104904175},{"id":"https://openalex.org/keywords/coherence","display_name":"Coherence (philosophical gambling strategy)","score":0.37279999256134033},{"id":"https://openalex.org/keywords/heuristics","display_name":"Heuristics","score":0.3671000003814697},{"id":"https://openalex.org/keywords/imitation","display_name":"Imitation","score":0.3646000027656555},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.3569999933242798},{"id":"https://openalex.org/keywords/model-based-reasoning","display_name":"Model-based reasoning","score":0.34880000352859497}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7178999781608582},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6912999749183655},{"id":"https://openalex.org/C95623464","wikidata":"https://www.wikidata.org/wiki/Q1096149","display_name":"Classifier (UML)","level":2,"score":0.5295000076293945},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5138999819755554},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.5109999775886536},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.49390000104904175},{"id":"https://openalex.org/C2781181686","wikidata":"https://www.wikidata.org/wiki/Q4226068","display_name":"Coherence (philosophical gambling strategy)","level":2,"score":0.37279999256134033},{"id":"https://openalex.org/C127705205","wikidata":"https://www.wikidata.org/wiki/Q5748245","display_name":"Heuristics","level":2,"score":0.3671000003814697},{"id":"https://openalex.org/C126388530","wikidata":"https://www.wikidata.org/wiki/Q1131737","display_name":"Imitation","level":2,"score":0.3646000027656555},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.3569999933242798},{"id":"https://openalex.org/C37335422","wikidata":"https://www.wikidata.org/wiki/Q6888134","display_name":"Model-based reasoning","level":3,"score":0.34880000352859497},{"id":"https://openalex.org/C75553542","wikidata":"https://www.wikidata.org/wiki/Q178161","display_name":"A priori and a posteriori","level":2,"score":0.3197000026702881},{"id":"https://openalex.org/C83725634","wikidata":"https://www.wikidata.org/wiki/Q7268699","display_name":"Qualitative reasoning","level":2,"score":0.30809998512268066},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.29280000925064087},{"id":"https://openalex.org/C20162079","wikidata":"https://www.wikidata.org/wiki/Q1151406","display_name":"Case-based reasoning","level":2,"score":0.2870999872684479},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.28700000047683716},{"id":"https://openalex.org/C165838908","wikidata":"https://www.wikidata.org/wiki/Q736777","display_name":"Calibration","level":2,"score":0.28619998693466187},{"id":"https://openalex.org/C100776233","wikidata":"https://www.wikidata.org/wiki/Q2532492","display_name":"Bridge (graph theory)","level":2,"score":0.2777000069618225},{"id":"https://openalex.org/C75294576","wikidata":"https://www.wikidata.org/wiki/Q5165192","display_name":"Contextual image classification","level":3,"score":0.2752000093460083},{"id":"https://openalex.org/C136389625","wikidata":"https://www.wikidata.org/wiki/Q334384","display_name":"Supervised learning","level":3,"score":0.2653999924659729},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.26010000705718994}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.10300","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.10300","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.10300","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.10300","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Conventional":[0],"video":[1,35,77,158],"classification":[2,78],"models,":[3],"acting":[4],"as":[5],"effective":[6],"imitators,":[7],"excel":[8],"in":[9],"scenarios":[10],"with":[11,69,88],"homogeneous":[12],"data":[13],"distributions.":[14],"However,":[15],"real-world":[16],"applications":[17],"often":[18],"present":[19],"an":[20,70,124],"open-instance":[21,76,157],"challenge,":[22],"where":[23],"intra-class":[24],"variations":[25],"are":[26],"vast":[27],"and":[28,168],"complex,":[29],"beyond":[30],"existing":[31],"benchmarks.":[32],"While":[33],"traditional":[34],"encoder":[36],"models":[37,45],"struggle":[38],"to":[39,81,93,106,114],"fit":[40],"these":[41],"diverse":[42],"distributions,":[43],"vision-language":[44],"(VLMs)":[46],"offer":[47],"superior":[48],"generalization":[49],"but":[50],"have":[51],"not":[52],"fully":[53],"leveraged":[54],"their":[55],"reasoning":[56,72,95,108,117,138],"capabilities":[57],"(intuition)":[58],"for":[59,156],"such":[60],"tasks.":[61],"In":[62,128],"this":[63,67,116,129,136],"paper,":[64],"we":[65],"bridge":[66],"gap":[68],"intrinsic":[71,137,171],"framework":[73],"that":[74,155],"evolves":[75],"from":[79,163],"imitation":[80,167],"intuition.":[82],"Our":[83,173],"approach,":[84],"namely":[85],"DeepIntuit,":[86],"begins":[87],"a":[89,131],"cold-start":[90],"supervised":[91],"alignment":[92],"initialize":[94],"capability,":[96],"followed":[97],"by":[98,141],"refinement":[99],"using":[100],"Group":[101],"Relative":[102],"Policy":[103],"Optimization":[104],"(GRPO)":[105],"enhance":[107],"coherence":[109],"through":[110],"reinforcement":[111],"learning.":[112],"Crucially,":[113],"translate":[115],"into":[118],"accurate":[119],"classification,":[120,159],"DeepIntuit":[121,160],"then":[122],"introduces":[123],"intuitive":[125],"calibration":[126],"stage.":[127],"stage,":[130],"classifier":[132],"is":[133,175],"trained":[134],"on":[135],"traces":[139],"generated":[140],"the":[142],"refined":[143],"VLM,":[144],"ensuring":[145],"stable":[146],"knowledge":[147],"transfer":[148],"without":[149],"distribution":[150],"mismatch.":[151],"Extensive":[152],"experiments":[153],"demonstrate":[154],"benefits":[161],"significantly":[162],"transcending":[164],"simple":[165],"feature":[166],"evolving":[169],"toward":[170],"reasoning.":[172],"project":[174],"available":[176],"at":[177],"https://bwgzk-keke.github.io/DeepIntuit/.":[178]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-03-13T00:00:00"}
