{"id":"https://openalex.org/W7105515016","doi":"https://doi.org/10.15496/publikation-113350","title":"Towards Better Video Understanding through Language Guidance","display_name":"Towards Better Video Understanding through Language Guidance","publication_year":2025,"publication_date":"2025-11-10","ids":{"openalex":"https://openalex.org/W7105515016","doi":"https://doi.org/10.15496/publikation-113350"},"language":"en","primary_location":{"id":"doi:10.15496/publikation-113350","is_oa":true,"landing_page_url":"https://doi.org/10.15496/publikation-113350","pdf_url":null,"source":{"id":"https://openalex.org/S7407053000","display_name":"Universit\u00e4tsbibliothek T\u00fcbingen","issn_l":null,"issn":[],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"other","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.15496/publikation-113350","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Hummel, Thomas","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Hummel, Thomas","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":null,"topics":[],"keywords":[{"id":"https://openalex.org/keywords/class","display_name":"Class (philosophy)","score":0.5932000279426575},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5598000288009644},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.5551999807357788},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5339000225067139},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.5054000020027161},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.4115999937057495},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.40070000290870667},{"id":"https://openalex.org/keywords/robotics","display_name":"Robotics","score":0.39149999618530273}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7835000157356262},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6001999974250793},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.5932000279426575},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5598000288009644},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.5551999807357788},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5339000225067139},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.5054000020027161},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.4115999937057495},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.40070000290870667},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.39149999618530273},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.361299991607666},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3564999997615814},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.34459999203681946},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.3181999921798706},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.31369999051094055},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3005000054836273},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.3001999855041504},{"id":"https://openalex.org/C115051666","wikidata":"https://www.wikidata.org/wiki/Q6522493","display_name":"Ranging","level":2,"score":0.29820001125335693},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.2858999967575073},{"id":"https://openalex.org/C207347870","wikidata":"https://www.wikidata.org/wiki/Q371174","display_name":"Gesture","level":2,"score":0.2718999981880188},{"id":"https://openalex.org/C155092808","wikidata":"https://www.wikidata.org/wiki/Q182557","display_name":"Computational linguistics","level":2,"score":0.2676999866962433},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2651999890804291},{"id":"https://openalex.org/C2987834672","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Action recognition","level":3,"score":0.2558000087738037},{"id":"https://openalex.org/C2983448237","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Language understanding","level":2,"score":0.25360000133514404},{"id":"https://openalex.org/C2988167200","wikidata":"https://www.wikidata.org/wiki/Q16885149","display_name":"Online video","level":2,"score":0.2535000145435333}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.15496/publikation-113350","is_oa":true,"landing_page_url":"https://doi.org/10.15496/publikation-113350","pdf_url":null,"source":{"id":"https://openalex.org/S7407053000","display_name":"Universit\u00e4tsbibliothek T\u00fcbingen","issn_l":null,"issn":[],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.15496/publikation-113350","is_oa":true,"landing_page_url":"https://doi.org/10.15496/publikation-113350","pdf_url":null,"source":{"id":"https://openalex.org/S7407053000","display_name":"Universit\u00e4tsbibliothek T\u00fcbingen","issn_l":null,"issn":[],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Video":[0],"understanding":[1,27,241,276,339,402],"is":[2,180],"a":[3,29,135,153,198,218,273,288,295,309,348],"crucial":[4],"area":[5],"of":[6,37,57,117,277,323,341,364],"computer":[7],"vision,":[8],"with":[9,94,149,196],"applications":[10],"ranging":[11,59],"from":[12,60,375],"autonomous":[13],"driving":[14],"and":[15,46,74,79,86,159,174,206,267,319,350,382,399],"robotics":[16],"to":[17,33,41,63,70,75,113,140,182,191,204,216,221,254,299,314,378],"multimedia":[18],"interaction.":[19],"Despite":[20],"significant":[21],"progress":[22],"in":[23,101,110,152],"image":[24],"analysis,":[25],"video":[26,102,108,143,194,232,278,285,297,317,338,393,401],"remains":[28],"complex":[30],"problem":[31],"due":[32],"the":[34,115,164,320,328,335,361,387],"temporal":[35,172,337,356],"nature":[36],"videos,":[38],"requiring":[39],"models":[40,139,188,343],"analyse":[42],"both":[43,316],"individual":[44],"frames":[45],"their":[47],"relationships":[48,173],"over":[49],"time.":[50],"This":[51,245],"work":[52,89,388],"explores":[53,90,238],"how":[54,91],"various":[55],"forms":[56],"language,":[58],"class":[61,126,376],"labels":[62,127,377],"natural":[64,291,383],"language":[65,93,292,366,370,384],"instructions,":[66],"can":[67,97],"be":[68],"leveraged":[69],"overcome":[71],"these":[72],"challenges":[73,100],"improve":[76],"model":[77,220,228,271,310],"capabilities":[78,340],"generalisation.":[80],"Moreover,":[81],"through":[82,242],"novel":[83,231],"settings,":[84],"benchmarks,":[85,207],"frameworks,":[87],"this":[88,178,236,281,307],"integrating":[92,369],"visual":[95],"information":[96,257],"address":[98,114],"key":[99],"understanding.":[103],"First,":[104],"we":[105,208,345],"explore":[106],"audio-visual":[107,122,147,183],"classification":[109,394],"low-data":[111],"regimes":[112],"limitations":[116],"traditional":[118,248,392],"supervised":[119],"learning.":[120],"In":[121,202],"generalised":[123,184],"zero-shot":[124],"learning,":[125,186],"represented":[128],"as":[129,134],"pre-trained":[130],"word":[131],"embeddings":[132,263],"serve":[133],"semantic":[136],"bridge,":[137],"enabling":[138],"classify":[141,192],"unseen":[142],"classes":[144,195],"by":[145,169,251,358],"aligning":[146],"features":[148],"textual":[150,324],"representations":[151,215],"shared":[154],"embedding":[155],"space.":[156],"Our":[157],"Temporal":[158],"Cross-Attention":[160],"Framework":[161],"(TCaF)":[162],"improves":[163],"alignment":[165],"and,":[166],"consequently,":[167],"generalisation":[168,229],"better":[170],"modelling":[171],"cross-modal":[175],"interactions.":[176],"Next,":[177],"setting":[179],"extended":[181],"few-shot":[185],"where":[187,290],"must":[189],"learn":[190],"new":[193],"only":[197],"few":[199],"labelled":[200],"examples.":[201],"addition":[203],"protocols":[205],"propose":[209,327],"AV-Diff,":[210],"which":[211,333],"uses":[212],"class-label":[213],"text":[214],"guide":[217],"diffusion":[219],"generate":[222],"synthetic":[223],"training":[224],"samples,":[225],"thereby":[226],"enhancing":[227],"for":[230],"classes.":[233],"Beyond":[234],"classification,":[235],"thesis":[237,282],"fine-grained":[239,336,379,400],"action":[240,249,380],"video-adverb":[243],"retrieval.":[244],"task":[246,289],"extends":[247],"recognition":[250],"incorporating":[252],"adverbs":[253],"provide":[255],"richer":[256],"about":[258],"actions.":[259],"By":[260,368],"learning":[261],"compositional":[262,312],"that":[264,353],"combine":[265],"actions":[266],"adverbs,":[268],"our":[269],"proposed":[270],"achieves":[272,354],"more":[274,396],"nuanced":[275],"content.":[279],"Finally,":[280],"tackles":[283],"composed":[284],"retrieval":[286],"(CVR),":[287],"instructions":[293,385],"modify":[294],"reference":[296],"query":[298],"retrieve":[300],"semantically":[301],"altered":[302],"videos.":[303],"To":[304],"successfully":[305],"solve":[306],"task,":[308],"requires":[311],"reasoning":[313,357,362],"interpret":[315],"content":[318],"transformative":[321],"effect":[322],"instructions.":[325],"We":[326],"egocentric":[329],"evaluation":[330],"benchmark":[331],"EgoCVR,":[332],"tests":[334],"vision-language":[342],"Furthermore,":[344],"present":[346],"TFR-CVR,":[347],"modular":[349],"training-free":[351],"framework":[352],"improved":[355],"strategically":[359],"utilising":[360],"abilities":[363],"large":[365],"models.":[367],"at":[371],"different":[372],"levels":[373],"\u2013":[374,386],"modifications":[381],"presented":[389],"pushes":[390],"beyond":[391],"towards":[395],"robust,":[397],"flexible,":[398],"capabilities.":[403]},"counts_by_year":[],"updated_date":"2025-11-13T11:05:06.444514","created_date":"2025-11-13T00:00:00"}
