{"id":"https://openalex.org/W4399026183","doi":"https://doi.org/10.48550/arxiv.2405.13949","title":"PitVQA: Image-grounded Text Embedding LLM for Visual Question Answering in Pituitary Surgery","display_name":"PitVQA: Image-grounded Text Embedding LLM for Visual Question Answering in Pituitary Surgery","publication_year":2024,"publication_date":"2024-05-22","ids":{"openalex":"https://openalex.org/W4399026183","doi":"https://doi.org/10.48550/arxiv.2405.13949"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2405.13949","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2405.13949","pdf_url":"https://arxiv.org/pdf/2405.13949","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2405.13949","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101366800","display_name":"Runlong He","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"He, Runlong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057223988","display_name":"Mengya Xu","orcid":"https://orcid.org/0000-0002-4338-7079"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Mengya","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041912825","display_name":"Adrito Das","orcid":"https://orcid.org/0000-0001-5682-9545"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Das, Adrito","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002717050","display_name":"Danyal Z. Khan","orcid":"https://orcid.org/0000-0001-9213-2550"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Khan, Danyal Z.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071609030","display_name":"Sophia Bano","orcid":"https://orcid.org/0000-0003-1329-4565"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bano, Sophia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5076899222","display_name":"Hani J. Marcus","orcid":"https://orcid.org/0000-0001-8000-392X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Marcus, Hani J.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077630267","display_name":"Danail Stoyanov","orcid":"https://orcid.org/0000-0002-0980-3227"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Stoyanov, Danail","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001311980","display_name":"Matthew J. Clarkson","orcid":"https://orcid.org/0000-0002-5565-1252"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Clarkson, Matthew J.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5066653567","display_name":"Mobarakol Islam","orcid":"https://orcid.org/0000-0002-7162-2822"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Islam, Mobarakol","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5101366800"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9894999861717224,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9894999861717224,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.7201257348060608},{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.5241332650184631},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.5059639811515808},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3984697759151459},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.36743009090423584},{"id":"https://openalex.org/keywords/medicine","display_name":"Medicine","score":0.3413599133491516},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3161434531211853}],"concepts":[{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.7201257348060608},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.5241332650184631},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.5059639811515808},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3984697759151459},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.36743009090423584},{"id":"https://openalex.org/C71924100","wikidata":"https://www.wikidata.org/wiki/Q11190","display_name":"Medicine","level":0,"score":0.3413599133491516},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3161434531211853}],"mesh":[],"locations_count":3,"locations":[{"id":"pmh:oai:arXiv.org:2405.13949","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2405.13949","pdf_url":"https://arxiv.org/pdf/2405.13949","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"pmh:oai:eprints.ucl.ac.uk.OAI2:10203116","is_oa":true,"landing_page_url":"https://discovery.ucl.ac.uk/id/eprint/10203116/","pdf_url":"https://discovery.ucl.ac.uk/10203116/1/3403_paper.pdf","source":{"id":"https://openalex.org/S4306400024","display_name":"UCL Discovery (University College London)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I45129253","host_organization_name":"University College London","host_organization_lineage":["https://openalex.org/I45129253"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"In: Dou, Q and Linguraru, MG and Feragen, A and Giannarou, S and Glocker, B and Lekadir, K and Schnabel, JA, (eds.) Medical Image Computing and Computer Assisted Intervention \u2013 MICCAI 2024.  (pp. pp. 488-498).  Springer Nature: Cham, Switzerland. (2024)","raw_type":"Proceedings paper"},{"id":"doi:10.48550/arxiv.2405.13949","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2405.13949","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2405.13949","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2405.13949","pdf_url":"https://arxiv.org/pdf/2405.13949","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1051717971","display_name":null,"funder_award_id":"863146","funder_id":"https://openalex.org/F4320320005","funder_display_name":"Royal Academy of Engineering"},{"id":"https://openalex.org/G2058139454","display_name":null,"funder_award_id":"EP/S021612/1","funder_id":"https://openalex.org/F4320320005","funder_display_name":"Royal Academy of Engineering"},{"id":"https://openalex.org/G4284291112","display_name":null,"funder_award_id":"EP/S021612/1","funder_id":"https://openalex.org/F4320334627","funder_display_name":"Engineering and Physical Sciences Research Council"},{"id":"https://openalex.org/G5935547209","display_name":null,"funder_award_id":"203145/Z/16/Z","funder_id":"https://openalex.org/F4320330765","funder_display_name":"Wellcome / EPSRC Centre for Interventional and Surgical Sciences"},{"id":"https://openalex.org/G6399526341","display_name":null,"funder_award_id":"UCLH/UCL","funder_id":"https://openalex.org/F4320319990","funder_display_name":"National Institute for Health and Care Research"},{"id":"https://openalex.org/G7638825437","display_name":"Context Aware Augmented Reality for Endonasal Endoscopic Surgery","funder_award_id":"EP/W00805X/1","funder_id":"https://openalex.org/F4320334627","funder_display_name":"Engineering and Physical Sciences Research Council"},{"id":"https://openalex.org/G7705462073","display_name":null,"funder_award_id":"863146","funder_id":"https://openalex.org/F4320330765","funder_display_name":"Wellcome / EPSRC Centre for Interventional and Surgical Sciences"},{"id":"https://openalex.org/G832900744","display_name":"AID-PitSurg: AI-enabled Decision support in Pituitary Surgery to reduce complications","funder_award_id":"EP/Y01958X/1","funder_id":"https://openalex.org/F4320334627","funder_display_name":"Engineering and Physical Sciences Research Council"},{"id":"https://openalex.org/G868016207","display_name":null,"funder_award_id":"203145/Z/16/Z","funder_id":"https://openalex.org/F4320334627","funder_display_name":"Engineering and Physical Sciences Research Council"},{"id":"https://openalex.org/G8925563342","display_name":null,"funder_award_id":"Horizon 2020 FET","funder_id":"https://openalex.org/F4320330765","funder_display_name":"Wellcome / EPSRC Centre for Interventional and Surgical Sciences"}],"funders":[{"id":"https://openalex.org/F4320319990","display_name":"National Institute for Health and Care Research","ror":"https://ror.org/0187kwz08"},{"id":"https://openalex.org/F4320320005","display_name":"Royal Academy of Engineering","ror":"https://ror.org/0526snb40"},{"id":"https://openalex.org/F4320325311","display_name":"University College London Hospitals NHS Foundation Trust","ror":"https://ror.org/042fqyp44"},{"id":"https://openalex.org/F4320330765","display_name":"Wellcome / EPSRC Centre for Interventional and Surgical Sciences","ror":"https://ror.org/03r42r570"},{"id":"https://openalex.org/F4320334627","display_name":"Engineering and Physical Sciences Research Council","ror":"https://ror.org/0439y7842"}],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4399026183.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2384605597","https://openalex.org/W2387743295","https://openalex.org/W2115758952","https://openalex.org/W3082787378","https://openalex.org/W2136007095","https://openalex.org/W2366230879","https://openalex.org/W3208425359","https://openalex.org/W2081900870","https://openalex.org/W2312145515","https://openalex.org/W4225922023"],"abstract_inverted_index":{"Visual":[0],"Question":[1],"Answering":[2],"(VQA)":[3],"within":[4,175],"the":[5,27,37,52,64,74,100,176,196,206,212,215,230],"surgical":[6,32,109,125,202],"domain,":[7],"utilizing":[8],"Large":[9],"Language":[10],"Models":[11],"(LLMs),":[12],"offers":[13],"a":[14,84,103,117,145,157],"distinct":[15],"opportunity":[16],"to":[17,63,170,194],"improve":[18],"intra-operative":[19],"decision-making":[20],"and":[21,41,54,73,95,116,130,137,139,153,161,191,201,214,227,237],"facilitate":[22],"intuitive":[23],"surgeon-AI":[24],"interaction.":[25],"However,":[26],"development":[28],"of":[29,39,51,71,99,120,144,179,208,225],"LLMs":[30],"for":[31,89,108],"VQA":[33,90],"is":[34,239],"hindered":[35],"by":[36],"scarcity":[38],"diverse":[40],"extensive":[42],"datasets":[43],"with":[44,102,164],"complex":[45,177],"reasoning":[46],"tasks.":[47],"Moreover,":[48],"contextual":[49,192,197],"fusion":[50],"image":[53,152],"text":[55,106,148,154,185],"modalities":[56],"remains":[57],"an":[58,97,165],"open":[59],"research":[60],"challenge":[61],"due":[62],"inherent":[65],"differences":[66],"between":[67,199],"these":[68],"two":[69],"types":[70],"information":[72],"complexity":[75],"involved":[76],"in":[77,91,222],"aligning":[78],"them.":[79],"This":[80],"paper":[81],"introduces":[82],"PitVQA,":[83],"novel":[85,104,146],"dataset":[86,238],"specifically":[87],"designed":[88],"endonasal":[92,180],"pituitary":[93,181],"surgery":[94],"PitVQA-Net,":[96],"adaptation":[98],"GPT2":[101,162],"image-grounded":[105,147,184],"embedding":[107,149,159,186],"VQA.":[110],"PitVQA":[111,213],"comprises":[112],"25":[113],"procedural":[114],"videos":[115],"rich":[118],"collection":[119],"question-answer":[121],"pairs":[122],"spanning":[123],"crucial":[124],"aspects":[126],"such":[127],"as":[128],"phase":[129],"step":[131],"recognition,":[132],"context":[133],"understanding,":[134],"tool":[135],"detection":[136],"localization,":[138],"tool-tissue":[140],"interactions.":[141],"PitVQA-Net":[142,209],"consists":[143],"that":[150],"projects":[151],"features":[155],"into":[156],"shared":[158],"space":[160],"Backbone":[163],"excitation":[166],"block":[167],"classification":[168],"head":[169],"generate":[171],"contextually":[172],"relevant":[173],"answers":[174],"domain":[178],"surgery.":[182],"Our":[183,235],"leverages":[187],"joint":[188],"embedding,":[189],"cross-attention":[190],"representation":[193],"understand":[195],"relationship":[198],"questions":[200],"images.":[203],"We":[204],"demonstrate":[205],"effectiveness":[207],"on":[210],"both":[211],"publicly":[216],"available":[217,240],"EndoVis18-VQA":[218],"dataset,":[219],"achieving":[220],"improvements":[221],"balanced":[223],"accuracy":[224],"8%":[226],"9%":[228],"over":[229],"most":[231],"recent":[232],"baselines,":[233],"respectively.":[234],"code":[236],"at":[241],"https://github.com/mobarakol/PitVQA.":[242]},"counts_by_year":[],"updated_date":"2026-04-19T08:26:33.389920","created_date":"2025-10-10T00:00:00"}
