{"id":"https://openalex.org/W4391128738","doi":"https://doi.org/10.1109/tpami.2024.3357631","title":"Every Problem, Every Step, All in Focus: Learning to Solve Vision-Language Problems With Integrated Attention","display_name":"Every Problem, Every Step, All in Focus: Learning to Solve Vision-Language Problems With Integrated Attention","publication_year":2024,"publication_date":"2024-01-23","ids":{"openalex":"https://openalex.org/W4391128738","doi":"https://doi.org/10.1109/tpami.2024.3357631","pmid":"https://pubmed.ncbi.nlm.nih.gov/38261479"},"language":"en","primary_location":{"id":"doi:10.1109/tpami.2024.3357631","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2024.3357631","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5053676752","display_name":"Xianyu Chen","orcid":"https://orcid.org/0000-0002-9027-3920"},"institutions":[{"id":"https://openalex.org/I130238516","display_name":"University of Minnesota","ror":"https://ror.org/017zqws13","country_code":"US","type":"education","lineage":["https://openalex.org/I130238516"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Xianyu Chen","raw_affiliation_strings":["Department of Computer Science and Engineering, University of Minnesota, Minneapolis, MN, USA"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, University of Minnesota, Minneapolis, MN, USA","institution_ids":["https://openalex.org/I130238516"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034091254","display_name":"Jinhui Yang","orcid":"https://orcid.org/0000-0001-8322-1121"},"institutions":[{"id":"https://openalex.org/I130238516","display_name":"University of Minnesota","ror":"https://ror.org/017zqws13","country_code":"US","type":"education","lineage":["https://openalex.org/I130238516"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jinhui Yang","raw_affiliation_strings":["Department of Computer Science and Engineering, University of Minnesota, Minneapolis, MN, USA"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, University of Minnesota, Minneapolis, MN, USA","institution_ids":["https://openalex.org/I130238516"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100362202","display_name":"Shi Chen","orcid":"https://orcid.org/0000-0002-3749-4767"},"institutions":[{"id":"https://openalex.org/I130238516","display_name":"University of Minnesota","ror":"https://ror.org/017zqws13","country_code":"US","type":"education","lineage":["https://openalex.org/I130238516"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shi Chen","raw_affiliation_strings":["Department of Computer Science and Engineering, University of Minnesota, Minneapolis, MN, USA"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, University of Minnesota, Minneapolis, MN, USA","institution_ids":["https://openalex.org/I130238516"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009001859","display_name":"Louis Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I130238516","display_name":"University of Minnesota","ror":"https://ror.org/017zqws13","country_code":"US","type":"education","lineage":["https://openalex.org/I130238516"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Louis Wang","raw_affiliation_strings":["Department of Computer Science and Engineering, University of Minnesota, Minneapolis, MN, USA"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, University of Minnesota, Minneapolis, MN, USA","institution_ids":["https://openalex.org/I130238516"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018896387","display_name":"Ming Jiang","orcid":"https://orcid.org/0000-0001-6439-5476"},"institutions":[{"id":"https://openalex.org/I130238516","display_name":"University of Minnesota","ror":"https://ror.org/017zqws13","country_code":"US","type":"education","lineage":["https://openalex.org/I130238516"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ming Jiang","raw_affiliation_strings":["Department of Computer Science and Engineering, University of Minnesota, Minneapolis, MN, USA"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, University of Minnesota, Minneapolis, MN, USA","institution_ids":["https://openalex.org/I130238516"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5047419128","display_name":"Qi Zhao","orcid":"https://orcid.org/0000-0003-3054-8934"},"institutions":[{"id":"https://openalex.org/I130238516","display_name":"University of Minnesota","ror":"https://ror.org/017zqws13","country_code":"US","type":"education","lineage":["https://openalex.org/I130238516"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Qi Zhao","raw_affiliation_strings":["Department of Computer Science and Engineering, University of Minnesota, Minneapolis, MN, USA"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, University of Minnesota, Minneapolis, MN, USA","institution_ids":["https://openalex.org/I130238516"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5053676752"],"corresponding_institution_ids":["https://openalex.org/I130238516"],"apc_list":null,"apc_paid":null,"fwci":0.7873,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.69526122,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":96},"biblio":{"volume":"46","issue":"7","first_page":"4720","last_page":"4735"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9879000186920166,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9778000116348267,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8195913434028625},{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.6878427863121033},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6112301349639893},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.5382393002510071},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5062865018844604},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.47488850355148315},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.4642195701599121},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.4486168622970581},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.419186532497406},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.4108673334121704},{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.4105672538280487},{"id":"https://openalex.org/keywords/theoretical-computer-science","display_name":"Theoretical computer science","score":0.2776361107826233},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.17117169499397278}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8195913434028625},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.6878427863121033},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6112301349639893},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5382393002510071},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5062865018844604},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.47488850355148315},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.4642195701599121},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.4486168622970581},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.419186532497406},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.4108673334121704},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.4105672538280487},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2776361107826233},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.17117169499397278},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/tpami.2024.3357631","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2024.3357631","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},{"id":"pmid:38261479","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/38261479","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE transactions on pattern analysis and machine intelligence","raw_type":null}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":78,"referenced_works":["https://openalex.org/W1889081078","https://openalex.org/W2091019377","https://openalex.org/W2099614498","https://openalex.org/W2146950091","https://openalex.org/W2162558265","https://openalex.org/W2193145675","https://openalex.org/W2194775991","https://openalex.org/W2560730294","https://openalex.org/W2745461083","https://openalex.org/W2753232960","https://openalex.org/W2890399523","https://openalex.org/W2895420168","https://openalex.org/W2896457183","https://openalex.org/W2951113489","https://openalex.org/W2952132648","https://openalex.org/W2957775769","https://openalex.org/W2963503775","https://openalex.org/W2963516811","https://openalex.org/W2963518342","https://openalex.org/W2963644680","https://openalex.org/W2963703197","https://openalex.org/W2964094654","https://openalex.org/W2964187781","https://openalex.org/W2970231061","https://openalex.org/W2983256121","https://openalex.org/W2984008963","https://openalex.org/W2989868392","https://openalex.org/W2998014937","https://openalex.org/W3004349648","https://openalex.org/W3011215845","https://openalex.org/W3034655362","https://openalex.org/W3035454331","https://openalex.org/W3037725825","https://openalex.org/W3087871082","https://openalex.org/W3094502228","https://openalex.org/W3099043899","https://openalex.org/W3106250896","https://openalex.org/W3106768499","https://openalex.org/W3108161363","https://openalex.org/W3165695488","https://openalex.org/W3175445769","https://openalex.org/W3175888430","https://openalex.org/W3188318562","https://openalex.org/W3199088499","https://openalex.org/W3199332348","https://openalex.org/W3204924011","https://openalex.org/W3206675006","https://openalex.org/W3216130706","https://openalex.org/W4214482673","https://openalex.org/W4235646468","https://openalex.org/W4237129069","https://openalex.org/W4249013746","https://openalex.org/W4307845887","https://openalex.org/W4312478760","https://openalex.org/W4312518915","https://openalex.org/W4312825258","https://openalex.org/W4312885609","https://openalex.org/W4312933868","https://openalex.org/W4312956471","https://openalex.org/W4368754767","https://openalex.org/W4378908626","https://openalex.org/W4385245566","https://openalex.org/W4386065689","https://openalex.org/W4386075710","https://openalex.org/W4386076369","https://openalex.org/W4390874412","https://openalex.org/W6631190155","https://openalex.org/W6637568146","https://openalex.org/W6639055396","https://openalex.org/W6639432524","https://openalex.org/W6730782440","https://openalex.org/W6758599184","https://openalex.org/W6763000187","https://openalex.org/W6767057552","https://openalex.org/W6778883912","https://openalex.org/W6797109355","https://openalex.org/W6797464607","https://openalex.org/W6852217271"],"related_works":["https://openalex.org/W3157284875","https://openalex.org/W2259406085","https://openalex.org/W3009270862","https://openalex.org/W2099715052","https://openalex.org/W2147241511","https://openalex.org/W4226247999","https://openalex.org/W4213176082","https://openalex.org/W2187398150","https://openalex.org/W3209772662","https://openalex.org/W4200629926"],"abstract_inverted_index":{"Integrating":[0],"information":[1,155,162],"from":[2],"vision":[3,16],"and":[4,17,30,40,115,153,158,183],"language":[5,19,154],"modalities":[6],"has":[7],"sparked":[8],"interesting":[9],"applications":[10],"in":[11,25,36,180,191],"the":[12,85,121,124,132,186],"fields":[13],"of":[14,87,123,173,188],"computer":[15],"natural":[18],"processing.":[20],"Existing":[21],"methods,":[22],"though":[23],"promising":[24],"tasks":[26],"like":[27],"image":[28],"captioning":[29],"visual":[31,152],"question":[32],"answering,":[33],"face":[34],"challenges":[35],"understanding":[37],"real-life":[38],"issues":[39],"offering":[41],"step-by-step":[42],"solutions.":[43],"In":[44],"particular,":[45],"they":[46],"typically":[47],"limit":[48],"their":[49],"scope":[50],"to":[51,71,112],"solutions":[52],"with":[53,99,131,141],"a":[54,68,77,100,170],"sequential":[55,114],"structure,":[56],"thus":[57],"ignoring":[58],"complex":[59],"inter-step":[60],"dependencies.":[61],"To":[62,127],"bridge":[63],"this":[64,105,146],"gap,":[65],"we":[66,135],"propose":[67],"graph-based":[69],"approach":[70,190],"vision-language":[72,194],"problem":[73],"solving.":[74],"It":[75],"leverages":[76],"novel":[78],"integrated":[79,147],"attention":[80,106,130,142],"mechanism":[81,107],"that":[82,144],"jointly":[83],"considers":[84],"importance":[86],"features":[88],"within":[89,156],"each":[90],"step":[91],"as":[92,94],"well":[93],"across":[95],"multiple":[96],"steps.":[97,165],"Together":[98],"graph":[101],"neural":[102],"network":[103],"method,":[104],"can":[108],"be":[109],"progressively":[110],"learned":[111],"predict":[113],"non-sequential":[116],"solution":[117,175],"graphs":[118],"depending":[119],"on":[120,168],"characterization":[122],"problem-solving":[125,133],"process.":[126],"tightly":[128],"couple":[129],"procedure,":[134],"further":[136],"design":[137],"new":[138],"learning":[139],"objectives":[140],"metrics":[143],"quantify":[145],"attention,":[148],"which":[149],"better":[150],"aligns":[151],"steps,":[157],"more":[159],"accurately":[160],"captures":[161],"flow":[163],"between":[164],"Experimental":[166],"results":[167],"VisualHow,":[169],"comprehensive":[171],"dataset":[172],"varying":[174],"structures,":[176],"show":[177],"significant":[178],"improvements":[179],"predicting":[181],"steps":[182],"dependencies,":[184],"demonstrating":[185],"effectiveness":[187],"our":[189],"tackling":[192],"various":[193],"problems.":[195]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
