{"id":"https://openalex.org/W4389890864","doi":"https://doi.org/10.1109/tpami.2023.3343736","title":"Enhancing Visual Grounding in Vision-Language Pre-Training With Position-Guided Text Prompts","display_name":"Enhancing Visual Grounding in Vision-Language Pre-Training With Position-Guided Text Prompts","publication_year":2023,"publication_date":"2023-12-18","ids":{"openalex":"https://openalex.org/W4389890864","doi":"https://doi.org/10.1109/tpami.2023.3343736","pmid":"https://pubmed.ncbi.nlm.nih.gov/38109234"},"language":"en","primary_location":{"id":"doi:10.1109/tpami.2023.3343736","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2023.3343736","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://ink.library.smu.edu.sg/sis_research/8742","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100376310","display_name":"Jinpeng Wang","orcid":"https://orcid.org/0000-0001-6127-9146"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":true,"raw_author_name":"Alex Jinpeng Wang","raw_affiliation_strings":["Show Lab, National University of Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"Show Lab, National University of Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010883708","display_name":"Pan Zhou","orcid":"https://orcid.org/0000-0003-3400-8943"},"institutions":[{"id":"https://openalex.org/I79891267","display_name":"Singapore Management University","ror":"https://ror.org/050qmg959","country_code":"SG","type":"education","lineage":["https://openalex.org/I79891267"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Pan Zhou","raw_affiliation_strings":["School of Computing and Information Systems, Singapore Management University, Singapore"],"affiliations":[{"raw_affiliation_string":"School of Computing and Information Systems, Singapore Management University, Singapore","institution_ids":["https://openalex.org/I79891267"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068937750","display_name":"Mike Zheng Shou","orcid":"https://orcid.org/0000-0002-7681-2166"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Mike Zheng Shou","raw_affiliation_strings":["Show Lab, National University of Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"Show Lab, National University of Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100381753","display_name":"Shuicheng Yan","orcid":"https://orcid.org/0000-0001-8906-3777"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shuicheng Yan","raw_affiliation_strings":["Sea AI Lab, Singapore"],"affiliations":[{"raw_affiliation_string":"Sea AI Lab, Singapore","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5100376310"],"corresponding_institution_ids":["https://openalex.org/I165932596"],"apc_list":null,"apc_paid":null,"fwci":0.959,"has_fulltext":false,"cited_by_count":8,"citation_normalized_percentile":{"value":0.78283514,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":"46","issue":"5","first_page":"3406","last_page":"3421"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.989799976348877,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6663394570350647},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6338658332824707},{"id":"https://openalex.org/keywords/block","display_name":"Block (permutation group theory)","score":0.5512906312942505},{"id":"https://openalex.org/keywords/ground","display_name":"Ground","score":0.5321385860443115},{"id":"https://openalex.org/keywords/sentence","display_name":"Sentence","score":0.4967513680458069},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4535471200942993},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.42805713415145874},{"id":"https://openalex.org/keywords/task-analysis","display_name":"Task analysis","score":0.4232352375984192},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.39568957686424255},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3655053377151489},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3369753062725067},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.1502678394317627},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.09122076630592346}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6663394570350647},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6338658332824707},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.5512906312942505},{"id":"https://openalex.org/C168993435","wikidata":"https://www.wikidata.org/wiki/Q6501125","display_name":"Ground","level":2,"score":0.5321385860443115},{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.4967513680458069},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4535471200942993},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.42805713415145874},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.4232352375984192},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.39568957686424255},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3655053377151489},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3369753062725067},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.1502678394317627},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.09122076630592346},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C201995342","wikidata":"https://www.wikidata.org/wiki/Q682496","display_name":"Systems engineering","level":1,"score":0.0},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/tpami.2023.3343736","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2023.3343736","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},{"id":"pmid:38109234","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/38109234","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE transactions on pattern analysis and machine intelligence","raw_type":null},{"id":"pmh:oai:ink.library.smu.edu.sg:sis_research-9745","is_oa":true,"landing_page_url":"https://ink.library.smu.edu.sg/sis_research/8742","pdf_url":null,"source":{"id":"https://openalex.org/S4306401925","display_name":"Singapore Management University Institutional Knowledge (InK) (Singapore Management University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I79891267","host_organization_name":"Singapore Management University","host_organization_lineage":["https://openalex.org/I79891267"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"https://doi.org/10.1109/TPAMI.2023.3343736","raw_type":"Journal Article"}],"best_oa_location":{"id":"pmh:oai:ink.library.smu.edu.sg:sis_research-9745","is_oa":true,"landing_page_url":"https://ink.library.smu.edu.sg/sis_research/8742","pdf_url":null,"source":{"id":"https://openalex.org/S4306401925","display_name":"Singapore Management University Institutional Knowledge (InK) (Singapore Management University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I79891267","host_organization_name":"Singapore Management University","host_organization_lineage":["https://openalex.org/I79891267"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"https://doi.org/10.1109/TPAMI.2023.3343736","raw_type":"Journal Article"},"sustainable_development_goals":[{"score":0.800000011920929,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[{"id":"https://openalex.org/G3968842904","display_name":null,"funder_award_id":"NRF-NRFF13-2021-0008","funder_id":"https://openalex.org/F4320320709","funder_display_name":"National Research Foundation Singapore"}],"funders":[{"id":"https://openalex.org/F4320320709","display_name":"National Research Foundation Singapore","ror":"https://ror.org/03cpyc314"},{"id":"https://openalex.org/F4320320751","display_name":"Ministry of Education - Singapore","ror":"https://ror.org/01kcva023"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":87,"referenced_works":["https://openalex.org/W639708223","https://openalex.org/W1861492603","https://openalex.org/W1889081078","https://openalex.org/W1933349210","https://openalex.org/W2277195237","https://openalex.org/W2463955103","https://openalex.org/W2489434015","https://openalex.org/W2606982687","https://openalex.org/W2745461083","https://openalex.org/W2765716052","https://openalex.org/W2886641317","https://openalex.org/W2896457183","https://openalex.org/W2904565150","https://openalex.org/W2963530300","https://openalex.org/W2969876226","https://openalex.org/W2975501350","https://openalex.org/W2998356391","https://openalex.org/W3001555892","https://openalex.org/W3034636873","https://openalex.org/W3035265375","https://openalex.org/W3035635319","https://openalex.org/W3035682985","https://openalex.org/W3091588028","https://openalex.org/W3094502228","https://openalex.org/W3104279398","https://openalex.org/W3168640669","https://openalex.org/W3173220247","https://openalex.org/W3176641147","https://openalex.org/W3177224328","https://openalex.org/W3184784418","https://openalex.org/W3185341429","https://openalex.org/W3193402170","https://openalex.org/W3193862643","https://openalex.org/W3204588463","https://openalex.org/W3212456749","https://openalex.org/W3214685499","https://openalex.org/W3215626407","https://openalex.org/W3217102353","https://openalex.org/W4225323055","https://openalex.org/W4226182655","https://openalex.org/W4229005866","https://openalex.org/W4229042118","https://openalex.org/W4288089799","https://openalex.org/W4312563428","https://openalex.org/W4312922092","https://openalex.org/W4312956471","https://openalex.org/W4312960937","https://openalex.org/W4320458302","https://openalex.org/W4322718191","https://openalex.org/W4366197770","https://openalex.org/W4367365797","https://openalex.org/W4385970122","https://openalex.org/W4386065353","https://openalex.org/W4386071547","https://openalex.org/W4386071687","https://openalex.org/W4386076661","https://openalex.org/W4391451889","https://openalex.org/W6639432524","https://openalex.org/W6676497082","https://openalex.org/W6755207826","https://openalex.org/W6766904570","https://openalex.org/W6766978945","https://openalex.org/W6767211374","https://openalex.org/W6768438993","https://openalex.org/W6769627184","https://openalex.org/W6773248631","https://openalex.org/W6778883912","https://openalex.org/W6779473860","https://openalex.org/W6784333009","https://openalex.org/W6789753369","https://openalex.org/W6790019176","https://openalex.org/W6791353385","https://openalex.org/W6798805250","https://openalex.org/W6800139874","https://openalex.org/W6803872405","https://openalex.org/W6803953248","https://openalex.org/W6804095316","https://openalex.org/W6810334672","https://openalex.org/W6811013733","https://openalex.org/W6811072154","https://openalex.org/W6811340617","https://openalex.org/W6846007759","https://openalex.org/W6850204008","https://openalex.org/W6850625674","https://openalex.org/W6851938174","https://openalex.org/W6852164222","https://openalex.org/W6855815363"],"related_works":["https://openalex.org/W2021787609","https://openalex.org/W2097328689","https://openalex.org/W4234899305","https://openalex.org/W1537063595","https://openalex.org/W2379604501","https://openalex.org/W2373854414","https://openalex.org/W2522183581","https://openalex.org/W2954371137","https://openalex.org/W2120744156","https://openalex.org/W1940530101"],"abstract_inverted_index":{"Vision-Language":[0],"Pre-Training":[1],"(VLP)":[2],"has":[3,144],"demonstrated":[4],"remarkable":[5],"potential":[6],"in":[7,34,118,135,213,223],"aligning":[8],"image":[9,83],"and":[10,39,89,203,219,238],"text":[11],"pairs,":[12],"paving":[13],"the":[14,66,77,104,113,123,150,168,176,226],"way":[15],"for":[16,45,216,225],"a":[17,57,91,109,126,136,145,239],"wide":[18],"range":[19],"of":[20,36,70,125,154,180],"cross-modal":[21,71,199],"learning":[22,200],"tasks.":[23,164],"Nevertheless,":[24],"we":[25,55,166],"have":[26],"observed":[27],"that":[28],"VLP":[29,78,155,190],"models":[30,72],"often":[31],"fall":[32],"short":[33],"terms":[35],"visual":[37,51,67,105,151,177],"grounding":[38,68,106,152,178],"localization":[40],"capabilities,":[41],"which":[42],"are":[43],"crucial":[44],"many":[46],"downstream":[47,163],"tasks,":[48],"such":[49,139,206],"as":[50,108,140,207,243],"reasoning.":[52],"In":[53,76],"response,":[54],"introduce":[56],"novel":[58],"Position-guided":[59],"Text":[60],"Prompt":[61],"(PTP)":[62],"paradigm":[63],"to":[64,95,115,159,173,193],"bolster":[65],"abilities":[69],"trained":[73],"with":[74,235],"VLP.":[75],"phase,":[79],"PTP":[80,101,137,183,186,231],"divides":[81],"an":[82],"into":[84,187],"N":[85,87],"x":[86],"blocks":[88,120,124],"employs":[90],"widely-used":[92],"object":[93,247],"detector":[94,248],"identify":[96],"objects":[97,117,172],"within":[98],"each":[99],"block.":[100],"then":[102],"reframes":[103],"task":[107],"fill-in-the-blank":[110],"problem,":[111],"encouraging":[112],"model":[114,201],"predict":[116],"given":[119,127],"or":[121,133],"regress":[122],"object,":[128],"exemplified":[129],"by":[130],"filling":[131],"\"[P]\"":[132],"\"[O]\"":[134],"sentence":[138],"\"The":[141],"block":[142],"[P]":[143],"[O].\"":[146],"This":[147],"strategy":[148],"enhances":[149],"capabilities":[153,179],"models,":[156],"enabling":[157],"them":[158],"better":[160],"tackle":[161],"various":[162],"Additionally,":[165],"integrate":[167],"seconda-order":[169],"relationships":[170],"between":[171],"further":[174],"enhance":[175],"our":[181],"proposed":[182],"paradigm.":[184],"Incorporating":[185],"several":[188],"state-of-the-art":[189,227],"frameworks":[191],"leads":[192],"consistently":[194],"significant":[195],"improvements":[196],"across":[197],"representative":[198],"architectures":[202],"multiple":[204],"benchmarks,":[205],"zero-shot":[208],"Flickr30":[209],"k":[210],"Retrieval":[211],"(+5.6":[212],"average":[214],"recall@1)":[215],"ViLT":[217],"baseline,":[218],"COCO":[220],"Captioning":[221],"(+5.5":[222],"CIDEr)":[224],"BLIP":[228],"baseline.":[229],"Furthermore,":[230],"attains":[232],"comparable":[233],"results":[234],"object-detector-based":[236],"methods":[237],"faster":[240],"inference":[241],"speed,":[242],"it":[244],"discards":[245],"its":[246],"during":[249],"inference,":[250],"unlike":[251],"other":[252],"approaches.":[253]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":5}],"updated_date":"2026-04-05T17:49:38.594831","created_date":"2025-10-10T00:00:00"}
