{"id":"https://openalex.org/W4406961395","doi":"https://doi.org/10.48550/arxiv.2501.17131","title":"Scenario Understanding of Traffic Scenes Through Large Visual Language Models","display_name":"Scenario Understanding of Traffic Scenes Through Large Visual Language Models","publication_year":2025,"publication_date":"2025-01-28","ids":{"openalex":"https://openalex.org/W4406961395","doi":"https://doi.org/10.48550/arxiv.2501.17131"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2501.17131","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2501.17131","pdf_url":"https://arxiv.org/pdf/2501.17131","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2501.17131","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5116078373","display_name":"Rivera Esteban","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Rivera, Esteban","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5116078374","display_name":"L\u00fcbberstedt Jannik","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"L\u00fcbberstedt, Jannik","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5092797532","display_name":"Nico Uhlemann","orcid":"https://orcid.org/0009-0006-8774-7888"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Uhlemann, Nico","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5079718896","display_name":"Markus Lienkamp","orcid":"https://orcid.org/0000-0002-9263-5323"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lienkamp, Markus","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5116078373"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9764000177383423,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9764000177383423,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9003000259399414,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6289052367210388},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.38207682967185974}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6289052367210388},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.38207682967185974}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2501.17131","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2501.17131","pdf_url":"https://arxiv.org/pdf/2501.17131","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2501.17131","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2501.17131","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2501.17131","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2501.17131","pdf_url":"https://arxiv.org/pdf/2501.17131","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4406961395.pdf","grobid_xml":"https://content.openalex.org/works/W4406961395.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"Deep":[0],"learning":[1],"models":[2],"for":[3,84,158],"autonomous":[4,162],"driving,":[5],"encompassing":[6],"perception,":[7],"planning,":[8],"and":[9,52,75,98,102,112,150],"control,":[10],"depend":[11],"on":[12,107,129],"vast":[13],"datasets":[14],"to":[15,26,38,100,145],"achieve":[16],"their":[17,21,40,152],"high":[18],"performance.":[19],"However,":[20],"generalization":[22],"often":[23,80],"suffers":[24],"due":[25],"domain-specific":[27],"data":[28,59],"distributions,":[29],"making":[30],"an":[31,109,155],"effective":[32],"scene-based":[33],"categorization":[34,76],"of":[35,94,143],"samples":[36],"necessary":[37],"improve":[39],"reliability":[41],"across":[42],"diverse":[43],"domains.":[44],"Manual":[45],"captioning,":[46],"though":[47],"valuable,":[48],"is":[49],"both":[50,108],"labor-intensive":[51],"time-consuming,":[53],"creating":[54],"a":[55,68,117,126],"bottleneck":[56],"in":[57,161],"the":[58,92,113,141],"annotation":[60],"process.":[61],"Large":[62],"Visual":[63],"Language":[64],"Models":[65],"(LVLMs)":[66],"present":[67],"compelling":[69],"solution":[70],"by":[71],"automating":[72],"image":[73],"analysis":[74],"through":[77],"contextual":[78],"queries,":[79],"without":[81],"requiring":[82],"retraining":[83],"new":[85,130],"categories.":[86],"In":[87],"this":[88],"study,":[89],"we":[90],"evaluate":[91],"capabilities":[93],"LVLMs,":[95],"including":[96],"GPT-4":[97],"LLaVA,":[99],"understand":[101,146],"classify":[103],"urban":[104,147],"traffic":[105,148],"scenes":[106],"in-house":[110],"dataset":[111],"BDD100K.":[114],"We":[115],"propose":[116],"scalable":[118],"captioning":[119],"pipeline":[120],"that":[121],"integrates":[122],"state-of-the-art":[123],"models,":[124],"enabling":[125],"flexible":[127],"deployment":[128],"datasets.":[131],"Our":[132],"analysis,":[133],"combining":[134],"quantitative":[135],"metrics":[136],"with":[137],"qualitative":[138],"insights,":[139],"demonstrates":[140],"effectiveness":[142],"LVLMs":[144],"scenarios":[149],"highlights":[151],"potential":[153],"as":[154],"efficient":[156],"tool":[157],"data-driven":[159],"advancements":[160],"driving.":[163]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-01-30T00:00:00"}
