{"id":"https://openalex.org/W7160864579","doi":"https://doi.org/10.48550/arxiv.2605.06809","title":"LookWhen? Fast Video Recognition by Learning When, Where, and What to Compute","display_name":"LookWhen? Fast Video Recognition by Learning When, Where, and What to Compute","publication_year":2026,"publication_date":"2026-05-07","ids":{"openalex":"https://openalex.org/W7160864579","doi":"https://doi.org/10.48550/arxiv.2605.06809"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.06809","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.06809","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.06809","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135838731","display_name":"Ali Salamatian","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Salamatian, Ali","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071556402","display_name":"Anthony Fuller","orcid":"https://orcid.org/0000-0001-8187-5850"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fuller, Anthony","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101861367","display_name":"Pritam Sarkar","orcid":"https://orcid.org/0000-0003-4000-3604"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sarkar, Pritam","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135903440","display_name":"James R. Green","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Green, James R.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135879475","display_name":"Leonid Sigal","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sigal, Leonid","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135834891","display_name":"Evan Shelhamer","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shelhamer, Evan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5135838731"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.6442999839782715,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.6442999839782715,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.056699998676776886,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.05570000037550926,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.534600019454956},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.49720001220703125},{"id":"https://openalex.org/keywords/video-processing","display_name":"Video processing","score":0.45969998836517334},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4097999930381775},{"id":"https://openalex.org/keywords/extractor","display_name":"Extractor","score":0.38190001249313354},{"id":"https://openalex.org/keywords/uniqueness","display_name":"Uniqueness","score":0.3614000082015991},{"id":"https://openalex.org/keywords/feature-selection","display_name":"Feature selection","score":0.3605000078678131}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7979999780654907},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6082000136375427},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.534600019454956},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.49720001220703125},{"id":"https://openalex.org/C65483669","wikidata":"https://www.wikidata.org/wiki/Q3536669","display_name":"Video processing","level":2,"score":0.45969998836517334},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4097999930381775},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.38589999079704285},{"id":"https://openalex.org/C117978034","wikidata":"https://www.wikidata.org/wiki/Q5422192","display_name":"Extractor","level":2,"score":0.38190001249313354},{"id":"https://openalex.org/C2777021972","wikidata":"https://www.wikidata.org/wiki/Q22976830","display_name":"Uniqueness","level":2,"score":0.3614000082015991},{"id":"https://openalex.org/C148483581","wikidata":"https://www.wikidata.org/wiki/Q446488","display_name":"Feature selection","level":2,"score":0.3605000078678131},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.34769999980926514},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.34369999170303345},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.3425999879837036},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.33709999918937683},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.31029999256134033},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.28290000557899475},{"id":"https://openalex.org/C2778598663","wikidata":"https://www.wikidata.org/wiki/Q1407599","display_name":"Video content analysis","level":4,"score":0.2827000021934509},{"id":"https://openalex.org/C9417928","wikidata":"https://www.wikidata.org/wiki/Q1070689","display_name":"Image processing","level":3,"score":0.2533000111579895}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.06809","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.06809","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.06809","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.06809","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Transformers":[0],"dominate":[1],"video":[2,40,56,120],"recognition.":[3],"They":[4],"split":[5],"videos":[6,18],"into":[7,42],"tokens,":[8],"and":[9,46,57,92,122,146,165,179,199],"processing":[10,79],"them":[11],"has":[12],"expensive":[13],"superlinear":[14],"computational":[15],"cost.":[16],"Yet":[17],"are":[19],"filled":[20],"with":[21],"redundancy,":[22],"so":[23],"we":[24,97,116,128,167],"can":[25],"question":[26],"the":[27,69,81],"need":[28],"for":[29,90,126,149],"this":[30],"expense.":[31],"We":[32],"introduce":[33,98],"LookWhen,":[34],"a":[35,54,99,109,119,155,172],"selector-extractor":[36,143],"framework":[37],"that":[38,103,169],"factorizes":[39],"recognition":[41],"learning":[43],"when,":[44],"where,":[45],"what":[47,135],"to":[48,73,133,154],"compute.":[49],"Our":[50],"shallow":[51],"selector":[52],"gets":[53,68],"scaled-down":[55],"quickly":[58],"scores":[59],"all":[60,80],"tokens":[61,72,105],"across":[62],"space-time,":[63],"while":[64],"our":[65,142],"deep":[66],"extractor":[67],"top-K":[70],"selected":[71],"approximate":[74],"full-video":[75],"representations":[76,102,132,148],"without":[77],"actually":[78],"tokens.":[82],"A":[83],"key":[84],"challenge":[85],"is":[86,211],"defining":[87],"effective":[88],"supervision":[89],"selection":[91,95],"extraction.":[93],"For":[94,113],"pre-training,":[96,115],"score":[100],"on":[101,159,189,202],"ranks":[104],"by":[106],"uniqueness":[107],"using":[108],"simple":[110],"nearest-neighbor":[111],"distance.":[112],"extraction":[114,151],"distill":[117],"both":[118],"teacher":[121],"an":[123],"image":[124],"teacher,":[125],"which":[127],"normalize":[129],"its":[130],"frame-wise":[131],"learn":[134],"changes":[136],"within":[137],"videos.":[138],"Through":[139,157],"these":[140],"strategies,":[141],"learns":[144],"general":[145],"efficient":[147,177,213],"feature":[150],"or":[152],"fine-tuning":[153],"task.":[156],"experiments":[158],"Kinetics-400,":[160],"SSv2,":[161],"Epic-Kitchens,":[162],"Diving48,":[163],"Jester,":[164],"Charades,":[166],"show":[168],"LookWhen":[170,185,210],"achieves":[171],"better":[173],"accuracy-computation":[174],"trade-off":[175],"than":[176,218],"models":[178],"upgraded":[180],"baselines":[181],"of":[182,191],"similar":[183],"size.":[184],"Pareto-dominates":[186],"in":[187,208],"accuracy-FLOPs":[188],"9":[190],"12":[192],"cases":[193],"(6":[194],"tasks":[195],"x":[196],"2":[197],"settings)":[198],"roughly":[200],"matches":[201],"3.":[203],"In":[204],"accuracy-throughput,":[205],"measuring":[206],"time":[207],"practice,":[209],"more":[212],"still":[214],"at":[215,220],"6.7x":[216],"faster":[217],"InternVideo2-B":[219],"equal":[221],"accuracy.":[222]},"counts_by_year":[],"updated_date":"2026-05-12T06:14:25.881160","created_date":"2026-05-12T00:00:00"}
