{"id":"https://openalex.org/W7155638752","doi":"https://doi.org/10.1016/j.cviu.2026.104772","title":"EARS4SEE: A multimodal audio description system dedicated to blind and visually impaired users","display_name":"EARS4SEE: A multimodal audio description system dedicated to blind and visually impaired users","publication_year":2026,"publication_date":"2026-04-25","ids":{"openalex":"https://openalex.org/W7155638752","doi":"https://doi.org/10.1016/j.cviu.2026.104772"},"language":"en","primary_location":{"id":"doi:10.1016/j.cviu.2026.104772","is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.cviu.2026.104772","pdf_url":null,"source":{"id":"https://openalex.org/S185008460","display_name":"Computer Vision and Image Understanding","issn_l":"1077-3142","issn":["1077-3142","1090-235X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computer Vision and Image Understanding","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://doi.org/10.1016/j.cviu.2026.104772","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5025204322","display_name":"Ruxandra \u021aapu","orcid":"https://orcid.org/0000-0003-3170-4150"},"institutions":[{"id":"https://openalex.org/I205703379","display_name":"Institut Mines-T\u00e9l\u00e9com","ror":"https://ror.org/025vp2923","country_code":"FR","type":"facility","lineage":["https://openalex.org/I205703379"]},{"id":"https://openalex.org/I4210145102","display_name":"Institut Polytechnique de Paris","ror":"https://ror.org/042tfbd02","country_code":"FR","type":"education","lineage":["https://openalex.org/I4210145102"]},{"id":"https://openalex.org/I61641377","display_name":"Universitatea Na\u021bional\u0103 de \u0218tiin\u021b\u0103 \u0219i Tehnologie Politehnica Bucure\u0219ti","ror":"https://ror.org/0558j5q12","country_code":"RO","type":"education","lineage":["https://openalex.org/I61641377"]}],"countries":["FR","RO"],"is_corresponding":true,"raw_author_name":"Ruxandra Tapu","raw_affiliation_strings":["Department of Telecommunications, Faculty of ETTI, University \u201cPolitehnica\u201d of Bucharest, Romania","Institut Polytechnique de Paris, T\u00e9l\u00e9com SudParis, ARTEMIS Department, 9 rue Charles Fourier, 91000 \u00c9vry, France"],"raw_orcid":"https://orcid.org/0000-0003-3170-4150","affiliations":[{"raw_affiliation_string":"Department of Telecommunications, Faculty of ETTI, University \u201cPolitehnica\u201d of Bucharest, Romania","institution_ids":["https://openalex.org/I61641377"]},{"raw_affiliation_string":"Institut Polytechnique de Paris, T\u00e9l\u00e9com SudParis, ARTEMIS Department, 9 rue Charles Fourier, 91000 \u00c9vry, France","institution_ids":["https://openalex.org/I205703379","https://openalex.org/I4210145102"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5134572193","display_name":"Bogdan Mocanu","orcid":null},"institutions":[{"id":"https://openalex.org/I61641377","display_name":"Universitatea Na\u021bional\u0103 de \u0218tiin\u021b\u0103 \u0219i Tehnologie Politehnica Bucure\u0219ti","ror":"https://ror.org/0558j5q12","country_code":"RO","type":"education","lineage":["https://openalex.org/I61641377"]}],"countries":["RO"],"is_corresponding":false,"raw_author_name":"Bogdan Mocanu","raw_affiliation_strings":["Department of Telecommunications, Faculty of ETTI, University \u201cPolitehnica\u201d of Bucharest, Romania"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Telecommunications, Faculty of ETTI, University \u201cPolitehnica\u201d of Bucharest, Romania","institution_ids":["https://openalex.org/I61641377"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5025204322"],"corresponding_institution_ids":["https://openalex.org/I205703379","https://openalex.org/I4210145102","https://openalex.org/I61641377"],"apc_list":{"value":2370,"currency":"USD","value_usd":2370},"apc_paid":{"value":2370,"currency":"USD","value_usd":2370},"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.96092218,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":"268","issue":null,"first_page":"104772","last_page":"104772"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13310","display_name":"Subtitles and Audiovisual Media","score":0.6312999725341797,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T13310","display_name":"Subtitles and Audiovisual Media","score":0.6312999725341797,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.1826999932527542,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10914","display_name":"Tactile and Sensory Interactions","score":0.04600000008940697,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.599399983882904},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.5964999794960022},{"id":"https://openalex.org/keywords/multimodal-interaction","display_name":"Multimodal interaction","score":0.4871000051498413},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.4803999960422516},{"id":"https://openalex.org/keywords/coarticulation","display_name":"Coarticulation","score":0.46560001373291016},{"id":"https://openalex.org/keywords/identification","display_name":"Identification (biology)","score":0.37599998712539673},{"id":"https://openalex.org/keywords/multimodality","display_name":"Multimodality","score":0.3393000066280365},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.32919999957084656}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8578000068664551},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.599399983882904},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.5964999794960022},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5098000168800354},{"id":"https://openalex.org/C135641252","wikidata":"https://www.wikidata.org/wiki/Q738567","display_name":"Multimodal interaction","level":2,"score":0.4871000051498413},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.4803999960422516},{"id":"https://openalex.org/C130727458","wikidata":"https://www.wikidata.org/wiki/Q1639109","display_name":"Coarticulation","level":3,"score":0.46560001373291016},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.43970000743865967},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.37599998712539673},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.3393000066280365},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.32919999957084656},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.32429999113082886},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.31769999861717224},{"id":"https://openalex.org/C2781181686","wikidata":"https://www.wikidata.org/wiki/Q4226068","display_name":"Coherence (philosophical gambling strategy)","level":2,"score":0.3043000102043152},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2937999963760376},{"id":"https://openalex.org/C62230096","wikidata":"https://www.wikidata.org/wiki/Q275969","display_name":"Crowdsourcing","level":2,"score":0.2847999930381775},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.28220000863075256},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.27970001101493835},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2745000123977661},{"id":"https://openalex.org/C2780861071","wikidata":"https://www.wikidata.org/wiki/Q1062934","display_name":"Character (mathematics)","level":2,"score":0.2736999988555908},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.27140000462532043},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.26330000162124634},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.25450000166893005},{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.25130000710487366},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.25}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1016/j.cviu.2026.104772","is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.cviu.2026.104772","pdf_url":null,"source":{"id":"https://openalex.org/S185008460","display_name":"Computer Vision and Image Understanding","issn_l":"1077-3142","issn":["1077-3142","1090-235X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computer Vision and Image Understanding","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1016/j.cviu.2026.104772","is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.cviu.2026.104772","pdf_url":null,"source":{"id":"https://openalex.org/S185008460","display_name":"Computer Vision and Image Understanding","issn_l":"1077-3142","issn":["1077-3142","1090-235X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computer Vision and Image Understanding","raw_type":"journal-article"},"sustainable_development_goals":[{"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10","score":0.7792025804519653}],"awards":[],"funders":[{"id":"https://openalex.org/F4320323983","display_name":"Unitatea Executiva pentru Finantarea Invatamantului Superior, a Cercetarii, Dezvoltarii si Inovarii","ror":"https://ror.org/01q7jq182"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":41,"referenced_works":["https://openalex.org/W27089656","https://openalex.org/W1956340063","https://openalex.org/W1976848015","https://openalex.org/W2055251102","https://openalex.org/W2120645068","https://openalex.org/W2522595994","https://openalex.org/W2601243251","https://openalex.org/W2789824578","https://openalex.org/W2884913105","https://openalex.org/W2962799512","https://openalex.org/W2963916161","https://openalex.org/W2963919999","https://openalex.org/W2969985801","https://openalex.org/W2987802279","https://openalex.org/W3032130225","https://openalex.org/W3034364644","https://openalex.org/W3034552680","https://openalex.org/W3095481265","https://openalex.org/W3203209821","https://openalex.org/W3206789311","https://openalex.org/W3216765867","https://openalex.org/W3217340782","https://openalex.org/W4214663214","https://openalex.org/W4288083805","https://openalex.org/W4312463400","https://openalex.org/W4312472480","https://openalex.org/W4312849330","https://openalex.org/W4385822293","https://openalex.org/W4386066097","https://openalex.org/W4386071559","https://openalex.org/W4386075553","https://openalex.org/W4388196066","https://openalex.org/W4389519587","https://openalex.org/W4390873033","https://openalex.org/W4390874338","https://openalex.org/W4402660135","https://openalex.org/W4402727538","https://openalex.org/W4402727693","https://openalex.org/W4405102071","https://openalex.org/W4411417376","https://openalex.org/W7133196460"],"related_works":[],"abstract_inverted_index":{"In":[0,38,194],"recent":[1,265],"years,":[2],"automatic":[3],"audio":[4,80,235],"description":[5],"(AD)":[6],"generation":[7,50],"has":[8],"become":[9],"an":[10,100,123,145,223],"important":[11],"research":[12],"domain":[13],"within":[14],"accessibility":[15],"and":[16,29,58,66,79,136,160,171,186,191,222],"assistive":[17],"technology,":[18],"driven":[19],"by":[20],"its":[21],"potential":[22],"to":[23,62,155,183,198],"enhance":[24,63],"content":[25],"understanding,":[26],"social":[27],"integration,":[28],"cognitive":[30],"engagement":[31],"for":[32,48,82,99,233],"individuals":[33],"with":[34,212],"visual":[35],"impairments":[36],"(VI).":[37],"this":[39],"paper,":[40],"we":[41],"introduce":[42],"EARS4SEE,":[43],"a":[44],"novel":[45],"multimodal":[46,147],"framework":[47,232],"AD":[49,267],"that":[51],"integrates":[52,72],"semantic":[53],"video":[54,148],"analysis,":[55],"character":[56,113,120,251],"tracking,":[57],"adaptive":[59,124],"temporal":[60,125,153,239,248],"segmentation":[61,149,240],"contextual":[64,161],"coherence":[65,259],"narrative":[67,258],"fluency.":[68],"The":[69,128,163],"proposed":[70,108,179,202],"system":[71],"multi-stream":[73],"fusion":[74],"strategy,":[75],"leveraging":[76],"visual,":[77],"textual,":[78],"modalities":[81],"character-centric,":[83],"semantically":[84],"enriched":[85],"AD.":[86],"Textual":[87],"descriptions":[88],"are":[89],"synthesized":[90],"into":[91],"natural-sounding":[92],"speech":[93],"using":[94,122],"state-of-the-art":[95,199],"text-to-speech":[96],"(TTS)":[97],"techniques":[98],"immersive":[101],"experience.":[102],"A":[103],"core":[104],"contribution":[105],"of":[106,177,189,209],"the":[107,111,169,175,178,201,210,218],"methodology":[109],"involves":[110],"tracking-based":[112],"recognition":[114],"module,":[115],"which":[116,181],"ensures":[117],"temporally":[118],"consistent":[119],"identification":[121],"attention":[126,249],"mechanism.":[127],"approach":[129],"mitigates":[130],"inconsistencies":[131],"from":[132],"motion":[133],"blur,":[134],"occlusions,":[135],"scale":[137],"variations,":[138],"improving":[139],"referential":[140],"continuity.":[141],"Additionally,":[142],"EARS4SEE":[143,263],"introduces":[144],"automated":[146],"pipeline,":[150],"capturing":[151],"long-range":[152],"dependencies":[154],"improve":[156],"scene":[157],"boundary":[158],"detection":[159],"alignment.":[162],"experimental":[164],"evaluation":[165],"carried":[166],"out":[167],"on":[168,269],"MAD-Eval-Named":[170],"TV-AD":[172],"datasets":[173],"validates":[174],"effectiveness":[176],"methodology,":[180],"leads":[182],"average":[184],"CIDEr":[185],"LLM-AD-eval":[187,227],"scores":[188],"24.1":[190],"3.02,":[192],"respectively.":[193],"addition,":[195],"when":[196],"compared":[197],"techniques,":[200],"architecture":[203],"shows":[204],"superior":[205],"performances":[206],"in":[207,214,217,226],"terms":[208],"CIDEr,":[211],"gains":[213],"accuracy":[215],"ranging":[216],"[1.72%,":[219],"10.2%]":[220],"interval":[221],"8%":[224],"increase":[225],"scores.":[228],"\u2022":[229,237,246,254,262],"Scene-aware,":[230],"training-free":[231,266],"long-form":[234],"description.":[236],"Multimodal":[238],"captures":[241],"context":[242],"across":[243,260],"multiple":[244],"shots.":[245,261],"Adaptive":[247],"improves":[250,257],"identity":[252],"consistency.":[253],"Memory-guided":[255],"prompting":[256],"outperforms":[264],"baselines":[268],"two":[270],"benchmarks.":[271]},"counts_by_year":[],"updated_date":"2026-04-28T06:04:28.489925","created_date":"2026-04-26T00:00:00"}
