{"id":"https://openalex.org/W4400645329","doi":"https://doi.org/10.1109/tcsvt.2024.3428487","title":"Multi-Modal Large Language Model Enhanced Pseudo 3D Perception Framework for Visual Commonsense Reasoning","display_name":"Multi-Modal Large Language Model Enhanced Pseudo 3D Perception Framework for Visual Commonsense Reasoning","publication_year":2024,"publication_date":"2024-07-15","ids":{"openalex":"https://openalex.org/W4400645329","doi":"https://doi.org/10.1109/tcsvt.2024.3428487"},"language":"en","primary_location":{"id":"doi:10.1109/tcsvt.2024.3428487","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcsvt.2024.3428487","pdf_url":null,"source":{"id":"https://openalex.org/S115173108","display_name":"IEEE Transactions on Circuits and Systems for Video Technology","issn_l":"1051-8215","issn":["1051-8215","1558-2205"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Circuits and Systems for Video Technology","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101670573","display_name":"Jian Zhu","orcid":"https://orcid.org/0000-0002-3835-4627"},"institutions":[{"id":"https://openalex.org/I116953780","display_name":"Tongji University","ror":"https://ror.org/03rc6as71","country_code":"CN","type":"education","lineage":["https://openalex.org/I116953780"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jian Zhu","raw_affiliation_strings":["Department of Computer Science and Technology and the Key Laboratory of Embedded System and Service Computing (Ministry of Education), Tongji University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology and the Key Laboratory of Embedded System and Service Computing (Ministry of Education), Tongji University, Shanghai, China","institution_ids":["https://openalex.org/I116953780"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058982350","display_name":"Hanli Wang","orcid":"https://orcid.org/0000-0002-9999-4871"},"institutions":[{"id":"https://openalex.org/I116953780","display_name":"Tongji University","ror":"https://ror.org/03rc6as71","country_code":"CN","type":"education","lineage":["https://openalex.org/I116953780"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hanli Wang","raw_affiliation_strings":["Department of Computer Science and Technology and the Key Laboratory of Embedded System and Service Computing (Ministry of Education), Tongji University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology and the Key Laboratory of Embedded System and Service Computing (Ministry of Education), Tongji University, Shanghai, China","institution_ids":["https://openalex.org/I116953780"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101675323","display_name":"Miaojing Shi","orcid":"https://orcid.org/0000-0002-4933-0073"},"institutions":[{"id":"https://openalex.org/I116953780","display_name":"Tongji University","ror":"https://ror.org/03rc6as71","country_code":"CN","type":"education","lineage":["https://openalex.org/I116953780"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Miaojing Shi","raw_affiliation_strings":["College of Electronic and Information Engineering, Tongji University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"College of Electronic and Information Engineering, Tongji University, Shanghai, China","institution_ids":["https://openalex.org/I116953780"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5101670573"],"corresponding_institution_ids":["https://openalex.org/I116953780"],"apc_list":null,"apc_paid":null,"fwci":0.7895,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.72028599,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":96},"biblio":{"volume":"34","issue":"11","first_page":"11682","last_page":"11694"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9915000200271606,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9915000200271606,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.984000027179718,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.98089998960495,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/commonsense-reasoning","display_name":"Commonsense reasoning","score":0.8542193174362183},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6995826363563538},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.6188791990280151},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.5853608846664429},{"id":"https://openalex.org/keywords/commonsense-knowledge","display_name":"Commonsense knowledge","score":0.5706024169921875},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5061549544334412},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4938731789588928},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.4127506613731384},{"id":"https://openalex.org/keywords/knowledge-representation-and-reasoning","display_name":"Knowledge representation and reasoning","score":0.19545236229896545},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.15335750579833984}],"concepts":[{"id":"https://openalex.org/C193221554","wikidata":"https://www.wikidata.org/wiki/Q5153664","display_name":"Commonsense reasoning","level":2,"score":0.8542193174362183},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6995826363563538},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.6188791990280151},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.5853608846664429},{"id":"https://openalex.org/C30542707","wikidata":"https://www.wikidata.org/wiki/Q1603203","display_name":"Commonsense knowledge","level":3,"score":0.5706024169921875},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5061549544334412},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4938731789588928},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4127506613731384},{"id":"https://openalex.org/C161301231","wikidata":"https://www.wikidata.org/wiki/Q3478658","display_name":"Knowledge representation and reasoning","level":2,"score":0.19545236229896545},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.15335750579833984},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C169760540","wikidata":"https://www.wikidata.org/wiki/Q207011","display_name":"Neuroscience","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tcsvt.2024.3428487","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcsvt.2024.3428487","pdf_url":null,"source":{"id":"https://openalex.org/S115173108","display_name":"IEEE Transactions on Circuits and Systems for Video Technology","issn_l":"1051-8215","issn":["1051-8215","1558-2205"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Circuits and Systems for Video Technology","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10","score":0.41999998688697815}],"awards":[{"id":"https://openalex.org/G1300661389","display_name":null,"funder_award_id":"2021SHZDZX0100","funder_id":"https://openalex.org/F4320335480","funder_display_name":"Guangzhou Municipal Science and Technology Project"},{"id":"https://openalex.org/G2125127483","display_name":null,"funder_award_id":"62371343","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335480","display_name":"Guangzhou Municipal Science and Technology Project","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":44,"referenced_works":["https://openalex.org/W2194775991","https://openalex.org/W2471094925","https://openalex.org/W2608787653","https://openalex.org/W2745461083","https://openalex.org/W2896457183","https://openalex.org/W2963115613","https://openalex.org/W2963717374","https://openalex.org/W2968124245","https://openalex.org/W2976021265","https://openalex.org/W3021007069","https://openalex.org/W3035651653","https://openalex.org/W3090449556","https://openalex.org/W3099023595","https://openalex.org/W3160447289","https://openalex.org/W3174690404","https://openalex.org/W3176896951","https://openalex.org/W3177224328","https://openalex.org/W3185066916","https://openalex.org/W3206218097","https://openalex.org/W4214520160","https://openalex.org/W4226396383","https://openalex.org/W4285118488","https://openalex.org/W4296704971","https://openalex.org/W4308234195","https://openalex.org/W4312377093","https://openalex.org/W4312911675","https://openalex.org/W4313054169","https://openalex.org/W4318718936","https://openalex.org/W4366330503","https://openalex.org/W4366850747","https://openalex.org/W4376481269","https://openalex.org/W4377971516","https://openalex.org/W4383337770","https://openalex.org/W4385245566","https://openalex.org/W4391557963","https://openalex.org/W6631190155","https://openalex.org/W6728881024","https://openalex.org/W6767194493","https://openalex.org/W6767211374","https://openalex.org/W6767362881","https://openalex.org/W6769264886","https://openalex.org/W6849177959","https://openalex.org/W6851592950","https://openalex.org/W6851950068"],"related_works":["https://openalex.org/W3035583586","https://openalex.org/W4320165839","https://openalex.org/W2151799802","https://openalex.org/W4385488510","https://openalex.org/W2196562041","https://openalex.org/W2073302931","https://openalex.org/W3206107299","https://openalex.org/W3082691151","https://openalex.org/W4287633646","https://openalex.org/W4378501473"],"abstract_inverted_index":{"The":[0,278],"visual":[1,65,94,170,184,229],"commonsense":[2],"reasoning":[3,91,261],"(VCR)":[4],"task":[5],"is":[6,111,124,151,191,214,248],"to":[7,59,126,139,153,165,197,216,237,250],"choose":[8],"an":[9,104,212],"answer":[10,179,190,202],"and":[11,21,31,63,131,204,219,221],"provide":[12],"a":[13,51,148,194,244],"justifying":[14],"rationale":[15],"based":[16,258],"on":[17,92,259,264],"the":[18,102,120,160,178,181,189,207,222,253,265,269,272],"given":[19],"image":[20],"textural":[22],"question.":[23],"Representative":[24],"works":[25],"first":[26,117],"recognize":[27],"objects":[28,49,62,95,123,144,158,168,230],"in":[29,38,50,129,145,188,225,287],"images":[30,218],"then":[32],"associate":[33,167,177],"them":[34,57],"with":[35,169,180,193,233],"key":[36],"words":[37,203],"texts.":[39,98],"However,":[40],"existing":[41],"approaches":[42],"do":[43],"not":[44,84],"consider":[45,252],"exact":[46],"positions":[47,142],"of":[48,101,143,163,183,255,271,281],"human-like":[52],"three-dimensional":[53],"(3D)":[54],"manner,":[55],"making":[56],"incompetent":[58],"accurately":[60],"distinguish":[61],"understand":[64],"relation.":[66],"Recently,":[67],"multi-modal":[68,81],"large":[69],"language":[70],"models":[71],"(MLLMs)":[72],"have":[73],"been":[74],"used":[75],"as":[76,211,239],"powerful":[77],"tools":[78],"for":[79,85,113],"several":[80],"tasks":[82],"but":[83],"VCR":[86,137,266],"yet,":[87],"which":[88],"requires":[89],"elaborate":[90],"specific":[93,228],"referred":[96],"by":[97,173],"In":[99],"light":[100],"above,":[103],"MLLM":[105,213,241],"enhanced":[106],"pseudo":[107,195],"3D":[108,141],"perception":[109],"framework":[110,274],"designed":[112],"VCR.":[114],"Specifically,":[115],"we":[116],"demonstrate":[118,268],"that":[119],"relation":[121],"between":[122,157,201],"relevant":[125],"object":[127,134,235],"depths":[128],"images,":[130],"hence":[132],"introduce":[133],"depth":[135,155,182,196],"into":[136,159],"frameworks":[138],"infer":[140],"images.":[146],"Then,":[147],"depth-aware":[149,199],"Transformer":[150,164],"proposed":[152,273],"encode":[154],"differences":[156],"attention":[161],"mechanism":[162],"discriminatively":[166],"scenes":[171],"guided":[172],"depth.":[174],"To":[175],"further":[176],"scene,":[185],"each":[186],"word":[187],"tagged":[192],"realize":[198],"association":[200],"objects.":[205],"On":[206],"other":[208],"hand,":[209],"BLIP-2":[210],"employed":[215],"process":[217],"texts,":[220],"referring":[223],"expressions":[224],"texts":[226],"involving":[227],"are":[231],"modified":[232],"linguistic":[234],"labels":[236],"serve":[238],"comprehensible":[240],"inputs.":[242],"Finally,":[243],"parameter":[245],"optimization":[246],"technique":[247],"devised":[249],"fully":[251],"quality":[254],"data":[256],"batches":[257],"multi-level":[260],"confidence.":[262],"Experiments":[263],"dataset":[267],"superiority":[270],"over":[275],"state-of-the-art":[276],"approaches.":[277],"source":[279],"code":[280],"this":[282],"work":[283],"can":[284],"be":[285],"found":[286],"<uri":[288],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[289],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">https://mic.tongji.edu.cn</uri>.":[290]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":1}],"updated_date":"2025-12-26T23:08:49.675405","created_date":"2025-10-10T00:00:00"}
