{"id":"https://openalex.org/W7131104426","doi":"https://doi.org/10.1109/iccvw69036.2025.00560","title":"TrafficVILA: Scaling Vision-Language Models to High-Resolution Video Understanding for Traffic Safety Analysis","display_name":"TrafficVILA: Scaling Vision-Language Models to High-Resolution Video Understanding for Traffic Safety Analysis","publication_year":2025,"publication_date":"2025-10-19","ids":{"openalex":"https://openalex.org/W7131104426","doi":"https://doi.org/10.1109/iccvw69036.2025.00560"},"language":null,"primary_location":{"id":"doi:10.1109/iccvw69036.2025.00560","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccvw69036.2025.00560","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision Workshops (ICCVW)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5006504198","display_name":"Zaid Pervaiz Bhat","orcid":"https://orcid.org/0000-0002-8331-3154"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zaid Pervaiz Bhat","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074975001","display_name":"Seunghwan Cha","orcid":"https://orcid.org/0009-0007-6033-2030"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Seunghwan Cha","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126637441","display_name":"Rohan Gulati","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rohan Gulati","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056608074","display_name":"Monika Jhuria","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Monika Jhuria","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5117649542","display_name":"Varun Praveen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Varun Praveen","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068645922","display_name":"Tomasz Kornuta","orcid":"https://orcid.org/0000-0002-5173-2696"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tomasz Kornuta","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126590747","display_name":"Yao Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yao Lu","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5111912159","display_name":"Vidya N. Murali","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vidya Murali","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5006504198"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.7007,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.77611671,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"5366","last_page":"5373"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11099","display_name":"Autonomous Vehicle Technology and Safety","score":0.33489999175071716,"subfield":{"id":"https://openalex.org/subfields/2203","display_name":"Automotive Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11099","display_name":"Autonomous Vehicle Technology and Safety","score":0.33489999175071716,"subfield":{"id":"https://openalex.org/subfields/2203","display_name":"Automotive Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.11550000309944153,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10331","display_name":"Video Surveillance and Tracking Methods","score":0.08810000121593475,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/minimum-bounding-box","display_name":"Minimum bounding box","score":0.6366000175476074},{"id":"https://openalex.org/keywords/overlay","display_name":"Overlay","score":0.5679000020027161},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4921000003814697},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.4909000098705292},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.47450000047683716},{"id":"https://openalex.org/keywords/video-tracking","display_name":"Video tracking","score":0.47290000319480896},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.4699999988079071},{"id":"https://openalex.org/keywords/bounding-overwatch","display_name":"Bounding overwatch","score":0.39149999618530273}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7928000092506409},{"id":"https://openalex.org/C147037132","wikidata":"https://www.wikidata.org/wiki/Q6865426","display_name":"Minimum bounding box","level":3,"score":0.6366000175476074},{"id":"https://openalex.org/C136085584","wikidata":"https://www.wikidata.org/wiki/Q910289","display_name":"Overlay","level":2,"score":0.5679000020027161},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.510699987411499},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4921000003814697},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.4909000098705292},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.47450000047683716},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.47290000319480896},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.4699999988079071},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4569000005722046},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.42170000076293945},{"id":"https://openalex.org/C63584917","wikidata":"https://www.wikidata.org/wiki/Q333286","display_name":"Bounding overwatch","level":2,"score":0.39149999618530273},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.364300012588501},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.32710000872612},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.32409998774528503},{"id":"https://openalex.org/C65483669","wikidata":"https://www.wikidata.org/wiki/Q3536669","display_name":"Video processing","level":2,"score":0.3228999972343445},{"id":"https://openalex.org/C48677424","wikidata":"https://www.wikidata.org/wiki/Q6888088","display_name":"Mode (computer interface)","level":2,"score":0.3147999942302704},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3005000054836273},{"id":"https://openalex.org/C89992363","wikidata":"https://www.wikidata.org/wiki/Q5961558","display_name":"Track (disk drive)","level":2,"score":0.29269999265670776},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2896000146865845},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.2721000015735626},{"id":"https://openalex.org/C183469790","wikidata":"https://www.wikidata.org/wiki/Q333501","display_name":"Crash","level":2,"score":0.26969999074935913},{"id":"https://openalex.org/C2778029271","wikidata":"https://www.wikidata.org/wiki/Q5421931","display_name":"Extension (predicate logic)","level":2,"score":0.2644999921321869},{"id":"https://openalex.org/C139807058","wikidata":"https://www.wikidata.org/wiki/Q352374","display_name":"Adaptation (eye)","level":2,"score":0.25870001316070557},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.2581000030040741}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/iccvw69036.2025.00560","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccvw69036.2025.00560","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision Workshops (ICCVW)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":10,"referenced_works":["https://openalex.org/W1956340063","https://openalex.org/W2101105183","https://openalex.org/W2964089981","https://openalex.org/W3035564946","https://openalex.org/W4402716166","https://openalex.org/W4402727885","https://openalex.org/W4402904107","https://openalex.org/W4404781340","https://openalex.org/W4413146669","https://openalex.org/W7131078821"],"related_works":[],"abstract_inverted_index":{"Traffic":[0],"scene":[1],"understanding":[2],"from":[3],"surveillance":[4],"video":[5,28,50,73],"demands":[6],"the":[7,38,121,127,131],"detection":[8],"of":[9,43,130,142],"fine-grained":[10],"details":[11,54],"often":[12],"overlooked":[13],"by":[14,161],"standard":[15],"Vision":[16],"Language":[17],"Models":[18],"(VLMs).":[19],"We":[20,117],"introduce":[21],"TrafficVila,":[22],"a":[23,83,140],"system":[24],"designed":[25],"for":[26,78,102],"high-resolution":[27],"analysis":[29],"in":[30,126],"traffic":[31],"safety":[32],"applications.":[33],"At":[34],"its":[35],"core":[36],"is":[37],"NVILA-15B-HRL":[39],"model,":[40],"an":[41],"extension":[42],"NVILA":[44],"that":[45,88,147],"applies":[46],"dynamic":[47],"tiling":[48],"to":[49,92],"inputs,":[51],"capturing":[52],"critical":[53],"using":[55,76],"six":[56],"tiles":[57],"per":[58],"frame":[59],"with":[60,68,139],"temporal":[61,115],"localization.":[62],"TrafficVILA":[63,119],"builds":[64],"on":[65,120],"this":[66],"model":[67],"three":[69,129],"key":[70],"components:":[71],"(1)":[72],"Set-of-Mark":[74],"prompting":[75],"SAM2":[77],"accurate":[79],"object":[80],"tracking,":[81],"(2)":[82],"LLM-based":[84],"fact":[85,155],"checking":[86,156],"pipeline":[87],"leverages":[89],"MCQ":[90],"predictions":[91],"reduce":[93],"hallu-cinations,":[94],"and":[95,99,113,124,154],"(3)":[96],"intelligent":[97],"view":[98],"phase":[100],"selection":[101],"multi-perspective":[103],"datasets.":[104],"This":[105],"integrated":[106],"design":[107],"enables":[108],"both":[109],"fine":[110],"spatial":[111],"resolution":[112],"robust":[114],"reasoning.":[116],"applied":[118],"WTS":[122],"dataset,":[123],"ranked":[125],"top":[128],"2025":[132],"AI":[133],"City":[134],"Challenge":[135],"Track":[136],"2":[137],"leaderboard":[138],"score":[141],"58.85.":[143],"Ablation":[144],"studies":[145],"show":[146],"bounding":[148],"box":[149],"overlays":[150],"outperform":[151],"segmentation":[152],"masks,":[153],"significantly":[157],"improves":[158],"caption":[159],"accuracy":[160],"mitigating":[162],"hallucinations.":[163]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-02-25T06:17:34.324206","created_date":"2026-02-24T00:00:00"}
