{"id":"https://openalex.org/W7125127910","doi":"https://doi.org/10.1109/tmm.2026.3654372","title":"Visual Position Prompt for MLLM Based Visual Grounding","display_name":"Visual Position Prompt for MLLM Based Visual Grounding","publication_year":2026,"publication_date":"2026-01-01","ids":{"openalex":"https://openalex.org/W7125127910","doi":"https://doi.org/10.1109/tmm.2026.3654372"},"language":null,"primary_location":{"id":"doi:10.1109/tmm.2026.3654372","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2026.3654372","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5026824225","display_name":"Wenda Tang","orcid":"https://orcid.org/0000-0001-6684-4642"},"institutions":[{"id":"https://openalex.org/I36399199","display_name":"Nanjing University of Science and Technology","ror":"https://ror.org/00xp9wg62","country_code":"CN","type":"education","lineage":["https://openalex.org/I36399199"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wei Tang","raw_affiliation_strings":["School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, China"],"raw_orcid":"https://orcid.org/0000-0003-3414-2421","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, China","institution_ids":["https://openalex.org/I36399199"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101812029","display_name":"Yanpeng Sun","orcid":"https://orcid.org/0000-0001-6249-5596"},"institutions":[{"id":"https://openalex.org/I36399199","display_name":"Nanjing University of Science and Technology","ror":"https://ror.org/00xp9wg62","country_code":"CN","type":"education","lineage":["https://openalex.org/I36399199"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yanpeng Sun","raw_affiliation_strings":["School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, China"],"raw_orcid":"https://orcid.org/0000-0001-6249-5596","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, China","institution_ids":["https://openalex.org/I36399199"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005963007","display_name":"Qinying Gu","orcid":"https://orcid.org/0000-0002-8863-0810"},"institutions":[{"id":"https://openalex.org/I4210100255","display_name":"Beijing Academy of Artificial Intelligence","ror":"https://ror.org/016a74861","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210100255"]},{"id":"https://openalex.org/I4391012619","display_name":"Shanghai Artificial Intelligence Laboratory","ror":"https://ror.org/03wkvpx79","country_code":null,"type":"facility","lineage":["https://openalex.org/I4391012619"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qinying Gu","raw_affiliation_strings":["Shanghai Artificial Intelligence Laboratory, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0002-8863-0810","affiliations":[{"raw_affiliation_string":"Shanghai Artificial Intelligence Laboratory, Shanghai, China","institution_ids":["https://openalex.org/I4210100255","https://openalex.org/I4391012619"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5017096005","display_name":"Zechao Li","orcid":"https://orcid.org/0000-0002-5341-5985"},"institutions":[{"id":"https://openalex.org/I36399199","display_name":"Nanjing University of Science and Technology","ror":"https://ror.org/00xp9wg62","country_code":"CN","type":"education","lineage":["https://openalex.org/I36399199"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zechao Li","raw_affiliation_strings":["School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, China"],"raw_orcid":"https://orcid.org/0000-0002-5341-5985","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, China","institution_ids":["https://openalex.org/I36399199"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":17.6924,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.97134996,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":91,"max":98},"biblio":{"volume":"28","issue":null,"first_page":"3739","last_page":"3754"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8828999996185303,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8828999996185303,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10191","display_name":"Robotics and Sensor-Based Localization","score":0.021199999377131462,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11904","display_name":"Spatial Cognition and Navigation","score":0.016599999740719795,"subfield":{"id":"https://openalex.org/subfields/2203","display_name":"Automotive Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5856999754905701},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.5637000203132629},{"id":"https://openalex.org/keywords/spatial-contextual-awareness","display_name":"Spatial contextual awareness","score":0.5389999747276306},{"id":"https://openalex.org/keywords/ground","display_name":"Ground","score":0.5307999849319458},{"id":"https://openalex.org/keywords/position","display_name":"Position (finance)","score":0.5170999765396118},{"id":"https://openalex.org/keywords/overlay","display_name":"Overlay","score":0.4884999990463257},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.48100000619888306},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.4763000011444092},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.45419999957084656}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8751999735832214},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6554999947547913},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5856999754905701},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.5637000203132629},{"id":"https://openalex.org/C64754055","wikidata":"https://www.wikidata.org/wiki/Q7574053","display_name":"Spatial contextual awareness","level":2,"score":0.5389999747276306},{"id":"https://openalex.org/C168993435","wikidata":"https://www.wikidata.org/wiki/Q6501125","display_name":"Ground","level":2,"score":0.5307999849319458},{"id":"https://openalex.org/C198082294","wikidata":"https://www.wikidata.org/wiki/Q3399648","display_name":"Position (finance)","level":2,"score":0.5170999765396118},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.49799999594688416},{"id":"https://openalex.org/C136085584","wikidata":"https://www.wikidata.org/wiki/Q910289","display_name":"Overlay","level":2,"score":0.4884999990463257},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.48100000619888306},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4763000011444092},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.45419999957084656},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.38670000433921814},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.38609999418258667},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.35359999537467957},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.31209999322891235},{"id":"https://openalex.org/C207363949","wikidata":"https://www.wikidata.org/wiki/Q462915","display_name":"Visual space","level":3,"score":0.29899999499320984},{"id":"https://openalex.org/C3004257","wikidata":"https://www.wikidata.org/wiki/Q17084606","display_name":"Correspondence problem","level":2,"score":0.2867000102996826},{"id":"https://openalex.org/C135798126","wikidata":"https://www.wikidata.org/wiki/Q2167279","display_name":"Top-down and bottom-up design","level":2,"score":0.2842000126838684},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.2825999855995178},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.2822999954223633},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.27730000019073486},{"id":"https://openalex.org/C27511587","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relation","level":2,"score":0.25870001316070557},{"id":"https://openalex.org/C194226119","wikidata":"https://www.wikidata.org/wiki/Q161779","display_name":"Spatial reference system","level":2,"score":0.25609999895095825},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.25540000200271606}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tmm.2026.3654372","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2026.3654372","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","score":0.6875016093254089,"id":"https://metadata.un.org/sdg/4"}],"awards":[{"id":"https://openalex.org/G2531279109","display_name":null,"funder_award_id":"62425603","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G6750246608","display_name":null,"funder_award_id":"BK20240011","funder_id":"https://openalex.org/F4320334982","funder_display_name":"Basic Research Program of Jiangsu Province"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320334982","display_name":"Basic Research Program of Jiangsu Province","ror":null},{"id":"https://openalex.org/F7351031327","display_name":"Shanghai Artificial Intelligence Laboratory","ror":"https://ror.org/03wkvpx79"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Although":[0],"Multimodal":[1],"Large":[2],"Language":[3],"Models":[4],"(MLLMs)":[5],"excel":[6],"at":[7],"various":[8],"image-related":[9],"tasks,":[10],"they":[11],"encounter":[12],"challenges":[13],"in":[14,24,148,170],"precisely":[15],"aligning":[16],"coordinates":[17],"with":[18,51,82,130],"spatial":[19,42,65,112,131],"information":[20],"within":[21],"images,":[22],"particularly":[23],"position-aware":[25,119],"tasks":[26],"such":[27],"as":[28],"visual":[29,144,190],"grounding.":[30],"This":[31],"limitation":[32],"arises":[33],"from":[34],"two":[35,94],"key":[36],"factors.":[37],"First,":[38],"MLLMs":[39,165],"lack":[40],"explicit":[41],"references,":[43],"making":[44],"it":[45,152],"difficult":[46],"to":[47,68,87,109,121,199],"associate":[48],"textual":[49],"descriptions":[50],"precise":[52],"image":[53,108],"locations.":[54],"Second,":[55],"their":[56],"feature":[57],"extraction":[58],"processes":[59],"prioritize":[60],"global":[61,98],"context":[62],"over":[63],"fine-grained":[64,123],"details,":[66],"leading":[67],"weak":[69],"localization":[70],"capability.":[71,91],"To":[72,125],"address":[73],"these":[74],"issues,":[75],"we":[76,133],"introduce":[77,135],"VPP-LLaVA,":[78,182],"an":[79],"MLLM":[80],"enhanced":[81],"Visual":[83],"Position":[84],"Prompt":[85],"(VPP)":[86],"improve":[88],"its":[89],"grounding":[90,145,191],"VPP-LLaVA":[92],"integrates":[93],"complementary":[95],"mechanisms:":[96],"the":[97,106,115],"VPP":[99,117],"overlays":[100],"a":[101,137,149,175],"learnable,":[102],"axis-like":[103],"tensor":[104],"onto":[105],"input":[107],"provide":[110],"structured":[111],"cues,":[113],"while":[114],"local":[116],"incorporates":[118],"queries":[120],"support":[122],"localization.":[124],"effectively":[126],"train":[127],"our":[128],"model":[129],"guidance,":[132],"further":[134],"VPP-SFT,":[136],"curated":[138],"dataset":[139],"of":[140],"0.6":[141],"M":[142,168],"high-quality":[143],"samples.":[146],"Designed":[147],"compact":[150],"format,":[151],"enables":[153],"efficient":[154],"training":[155],"and":[156],"is":[157],"significantly":[158],"smaller":[159],"than":[160],"datasets":[161],"used":[162],"by":[163],"other":[164],"(e.g.,":[166],"21":[167],"samples":[169],"MiniGPT-v2),":[171],"yet":[172],"still":[173],"provides":[174],"strong":[176,196],"performance":[177],"boost.":[178],"The":[179],"resulting":[180],"model,":[181],"not":[183],"only":[184],"achieves":[185],"state-of-the-art":[186],"results":[187],"on":[188],"standard":[189],"benchmarks":[192],"but":[193],"also":[194],"demonstrates":[195],"zero-shot":[197],"generalization":[198],"challenging":[200],"unseen":[201],"datasets.":[202]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-01-22T00:00:00"}
