{"id":"https://openalex.org/W7141309902","doi":"https://doi.org/10.48550/arxiv.2603.25267","title":"EagleNet: Energy-Aware Fine-Grained Relationship Learning Network for Text-Video Retrieval","display_name":"EagleNet: Energy-Aware Fine-Grained Relationship Learning Network for Text-Video Retrieval","publication_year":2026,"publication_date":"2026-03-26","ids":{"openalex":"https://openalex.org/W7141309902","doi":"https://doi.org/10.48550/arxiv.2603.25267"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.25267","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.25267","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.25267","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5130760359","display_name":"Yuhan Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Chen, Yuhan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130732650","display_name":"Pengwen Dai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dai, Pengwen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130756648","display_name":"Chuan Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Chuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002436544","display_name":"Dayan Wu","orcid":"https://orcid.org/0000-0002-8604-7226"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Dayan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130752029","display_name":"Xiaochun Cao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Xiaochun","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5130760359"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9801999926567078,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9801999926567078,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.0032999999821186066,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.0031999999191612005,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5425999760627747},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.5152000188827515},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.438400000333786},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.4072999954223633},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.4065000116825104},{"id":"https://openalex.org/keywords/text-corpus","display_name":"Text corpus","score":0.35019999742507935},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature learning","score":0.32739999890327454}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7840999960899353},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5810999870300293},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5702000260353088},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5425999760627747},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.5152000188827515},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.438400000333786},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.4072999954223633},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.4065000116825104},{"id":"https://openalex.org/C2474386","wikidata":"https://www.wikidata.org/wiki/Q461183","display_name":"Text corpus","level":2,"score":0.35019999742507935},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.32739999890327454},{"id":"https://openalex.org/C4679612","wikidata":"https://www.wikidata.org/wiki/Q866298","display_name":"Aggregate (composite)","level":2,"score":0.3264000117778778},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.3246000111103058},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.3188000023365021},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.30219998955726624},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.298799991607666},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.2759000062942505},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.27140000462532043},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.2531000077724457},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.25220000743865967}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.25267","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.25267","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.25267","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.25267","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.5952616930007935}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Text-video":[0],"retrieval":[1],"tasks":[2],"have":[3,196],"seen":[4],"significant":[5],"improvements":[6],"due":[7],"to":[8,35,75,91,130,157],"the":[9,38,58,65,100,113,159,168,185,191,198],"recent":[10,28],"development":[11],"of":[12,161,170,200],"large-scale":[13],"vision-language":[14],"pre-trained":[15],"models.":[16],"Traditional":[17],"methods":[18,45],"primarily":[19],"focus":[20],"on":[21],"video":[22],"representations":[23],"or":[24],"cross-modal":[25,178],"alignment,":[26],"while":[27],"works":[29],"shift":[30],"toward":[31],"enriching":[32],"text":[33,50,68,78,97,115,132,137],"expressiveness":[34],"better":[36],"match":[37],"rich":[39,55],"semantics":[40],"in":[41,150],"videos.":[42],"However,":[43],"these":[44],"use":[46],"only":[47],"interactions":[48,56,163],"between":[49,77],"and":[51,53,79,94,117,124,164,180,206],"frames/video,":[52],"ignore":[54],"among":[57,122],"internal":[59],"frames":[60],"within":[61],"a":[62,109],"video,":[63],"so":[64],"final":[66],"expanded":[67],"cannot":[69],"capture":[70,167],"frame":[71,141],"contextual":[72,142],"information,":[73],"leading":[74],"disparities":[76],"video.":[80],"In":[81],"response,":[82],"we":[83,152,183],"introduce":[84],"Energy-Aware":[85,154],"Fine-Grained":[86,102],"Relationship":[87,103],"Learning":[88,104],"Network":[89],"(EagleNet)":[90],"generate":[92],"accurate":[93],"context-aware":[95],"enriched":[96,136],"embeddings.":[98],"Specifically,":[99],"proposed":[101],"mechanism":[105],"(FRL)":[106],"first":[107],"constructs":[108],"text-frame":[110,162],"graph":[111],"by":[112],"generated":[114],"candidates":[116,133],"frames,":[118,125],"then":[119],"learns":[120],"relationships":[121],"texts":[123],"which":[126],"are":[127,209],"finally":[128],"used":[129],"aggregate":[131],"into":[134],"an":[135],"embedding":[138],"that":[139],"incorporates":[140],"information.":[143],"To":[144],"further":[145],"improve":[146],"fine-grained":[147],"relationship":[148],"learning":[149],"FRL,":[151],"design":[153],"Matching":[155],"(EAM)":[156],"model":[158],"energy":[160],"thus":[165],"accurately":[166],"distribution":[169],"real":[171],"text-video":[172],"pairs.":[173],"Moreover,":[174],"for":[175],"more":[176],"effective":[177],"alignment":[179],"stable":[181],"training,":[182],"replace":[184],"conventional":[186],"softmax-based":[187],"contrastive":[188],"loss":[189],"with":[190],"sigmoid":[192],"loss.":[193],"Extensive":[194],"experiments":[195],"demonstrated":[197],"superiority":[199],"EagleNet":[201],"across":[202],"MSRVTT,":[203],"DiDeMo,":[204],"MSVD,":[205],"VATEX.":[207],"Codes":[208],"available":[210],"at":[211],"https://github.com/draym28/EagleNet.":[212]},"counts_by_year":[],"updated_date":"2026-03-28T06:16:51.555046","created_date":"2026-03-28T00:00:00"}
