{"id":"https://openalex.org/W7127120414","doi":"https://doi.org/10.1007/s00521-025-11721-5","title":"VTFCGNet: a novel cross-modal reasoning network integrating Fourier self-attention and graph attention for visual text question answering","display_name":"VTFCGNet: a novel cross-modal reasoning network integrating Fourier self-attention and graph attention for visual text question answering","publication_year":2026,"publication_date":"2026-02-01","ids":{"openalex":"https://openalex.org/W7127120414","doi":"https://doi.org/10.1007/s00521-025-11721-5"},"language":"en","primary_location":{"id":"doi:10.1007/s00521-025-11721-5","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s00521-025-11721-5","pdf_url":"https://link.springer.com/content/pdf/10.1007/s00521-025-11721-5.pdf","source":{"id":"https://openalex.org/S147897268","display_name":"Neural Computing and Applications","issn_l":"0941-0643","issn":["0941-0643","1433-3058"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Neural Computing and Applications","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://link.springer.com/content/pdf/10.1007/s00521-025-11721-5.pdf","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5020641943","display_name":"Yujie Huo","orcid":null},"institutions":[{"id":"https://openalex.org/I4576418","display_name":"University of Technology Malaysia","ror":"https://ror.org/026w31v75","country_code":"MY","type":"education","lineage":["https://openalex.org/I4576418"]}],"countries":["MY"],"is_corresponding":true,"raw_author_name":"Yujie Huo","raw_affiliation_strings":["Faculty of Computing, Universiti Teknologi Malaysia, UTM Skudai, 81310, Johor Bahru, Johor, Malaysia"],"raw_orcid":"https://orcid.org/0009-0001-3123-2803","affiliations":[{"raw_affiliation_string":"Faculty of Computing, Universiti Teknologi Malaysia, UTM Skudai, 81310, Johor Bahru, Johor, Malaysia","institution_ids":["https://openalex.org/I4576418"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047121335","display_name":"Weng Howe Chan","orcid":"https://orcid.org/0000-0003-0612-3661"},"institutions":[{"id":"https://openalex.org/I4576418","display_name":"University of Technology Malaysia","ror":"https://ror.org/026w31v75","country_code":"MY","type":"education","lineage":["https://openalex.org/I4576418"]}],"countries":["MY"],"is_corresponding":false,"raw_author_name":"Weng Howe Chan","raw_affiliation_strings":["Faculty of Computing, Universiti Teknologi Malaysia, UTM Skudai, 81310, Johor Bahru, Johor, Malaysia","UTM Big Data Centre, Ibnu Sina Institute For Scientific and Industrial Research Universiti Teknologi Malaysia, UTM Skudai, 81310, Johor Bahru, Johor, Malaysia"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Faculty of Computing, Universiti Teknologi Malaysia, UTM Skudai, 81310, Johor Bahru, Johor, Malaysia","institution_ids":["https://openalex.org/I4576418"]},{"raw_affiliation_string":"UTM Big Data Centre, Ibnu Sina Institute For Scientific and Industrial Research Universiti Teknologi Malaysia, UTM Skudai, 81310, Johor Bahru, Johor, Malaysia","institution_ids":["https://openalex.org/I4576418"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124791881","display_name":"Song Yu","orcid":null},"institutions":[{"id":"https://openalex.org/I139660479","display_name":"Central South University","ror":"https://ror.org/00f1zfq44","country_code":"CN","type":"education","lineage":["https://openalex.org/I139660479"]},{"id":"https://openalex.org/I48168531","display_name":"South University","ror":"https://ror.org/01rjfjt94","country_code":"US","type":"education","lineage":["https://openalex.org/I48168531"]}],"countries":["CN","US"],"is_corresponding":false,"raw_author_name":"Song Yu","raw_affiliation_strings":["School of Computer Science and Engineering, Central South University, 932 Lushan South Road, Changsha, 410083, Hunan Province, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Central South University, 932 Lushan South Road, Changsha, 410083, Hunan Province, China","institution_ids":["https://openalex.org/I139660479","https://openalex.org/I48168531"]}]},{"author_position":"last","author":{"id":null,"display_name":"Hongyu Gao","orcid":null},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hongyu Gao","raw_affiliation_strings":["Research Institute of Electronic Science and Technology, University of Electronic Science and Technology of China, 2006 Xiyuan Avenue, Chengdu, 611731, Sichuan Province, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Research Institute of Electronic Science and Technology, University of Electronic Science and Technology of China, 2006 Xiyuan Avenue, Chengdu, 611731, Sichuan Province, China","institution_ids":["https://openalex.org/I150229711"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5020641943"],"corresponding_institution_ids":["https://openalex.org/I4576418"],"apc_list":{"value":2390,"currency":"EUR","value_usd":2990},"apc_paid":{"value":2390,"currency":"EUR","value_usd":2990},"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.18633559,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"38","issue":"3","first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9909999966621399,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9909999966621399,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.0017999999690800905,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.00139999995008111,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.8266000151634216},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.57669997215271},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.5360000133514404},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.5088000297546387},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.4860999882221222},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.4595000147819519},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.451200008392334},{"id":"https://openalex.org/keywords/fourier-domain","display_name":"Fourier domain","score":0.42660000920295715}],"concepts":[{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.8266000151634216},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8237000107765198},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6069999933242798},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.57669997215271},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.5360000133514404},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.5088000297546387},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.4860999882221222},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4595000147819519},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.451200008392334},{"id":"https://openalex.org/C3019555358","wikidata":"https://www.wikidata.org/wiki/Q786423","display_name":"Fourier domain","level":3,"score":0.42660000920295715},{"id":"https://openalex.org/C68597687","wikidata":"https://www.wikidata.org/wiki/Q362601","display_name":"Computational Science and Engineering","level":2,"score":0.41130000352859497},{"id":"https://openalex.org/C207685749","wikidata":"https://www.wikidata.org/wiki/Q2088941","display_name":"Domain knowledge","level":2,"score":0.40380001068115234},{"id":"https://openalex.org/C102519508","wikidata":"https://www.wikidata.org/wiki/Q6520159","display_name":"Fourier transform","level":2,"score":0.3977000117301941},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3808000087738037},{"id":"https://openalex.org/C19118579","wikidata":"https://www.wikidata.org/wiki/Q786423","display_name":"Frequency domain","level":2,"score":0.36880001425743103},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3675000071525574},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.36320000886917114},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.32330000400543213},{"id":"https://openalex.org/C104122410","wikidata":"https://www.wikidata.org/wiki/Q1416406","display_name":"Network model","level":2,"score":0.298799991607666},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.29600000381469727},{"id":"https://openalex.org/C2993807640","wikidata":"https://www.wikidata.org/wiki/Q103709453","display_name":"Attention network","level":2,"score":0.29019999504089355},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.28760001063346863},{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.27129998803138733},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2662999927997589}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1007/s00521-025-11721-5","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s00521-025-11721-5","pdf_url":"https://link.springer.com/content/pdf/10.1007/s00521-025-11721-5.pdf","source":{"id":"https://openalex.org/S147897268","display_name":"Neural Computing and Applications","issn_l":"0941-0643","issn":["0941-0643","1433-3058"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Neural Computing and Applications","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1007/s00521-025-11721-5","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s00521-025-11721-5","pdf_url":"https://link.springer.com/content/pdf/10.1007/s00521-025-11721-5.pdf","source":{"id":"https://openalex.org/S147897268","display_name":"Neural Computing and Applications","issn_l":"0941-0643","issn":["0941-0643","1433-3058"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Neural Computing and Applications","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320323300","display_name":"Universiti Teknologi Malaysia","ror":"https://ror.org/026w31v75"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W7127120414.pdf","grobid_xml":"https://content.openalex.org/works/W7127120414.grobid-xml"},"referenced_works_count":37,"referenced_works":["https://openalex.org/W16066671","https://openalex.org/W2064675550","https://openalex.org/W2250539671","https://openalex.org/W2560730294","https://openalex.org/W2563399268","https://openalex.org/W2745461083","https://openalex.org/W2884585870","https://openalex.org/W2916723116","https://openalex.org/W2947312908","https://openalex.org/W2963477107","https://openalex.org/W2966683369","https://openalex.org/W2970231061","https://openalex.org/W2998374885","https://openalex.org/W3035454069","https://openalex.org/W3090449556","https://openalex.org/W3152635550","https://openalex.org/W4210836196","https://openalex.org/W4283775799","https://openalex.org/W4288265104","https://openalex.org/W4307724164","https://openalex.org/W4372342721","https://openalex.org/W4379929708","https://openalex.org/W4380201463","https://openalex.org/W4387802575","https://openalex.org/W4388181804","https://openalex.org/W4389140516","https://openalex.org/W4400767105","https://openalex.org/W4402698359","https://openalex.org/W4402753862","https://openalex.org/W4403576622","https://openalex.org/W4405011535","https://openalex.org/W4405921195","https://openalex.org/W4406160879","https://openalex.org/W4406501268","https://openalex.org/W4406611421","https://openalex.org/W4406657770","https://openalex.org/W4407172046"],"related_works":[],"abstract_inverted_index":{"Traditional":[0],"visual":[1,12,164],"question":[2,14,75],"answering":[3,15],"(VQA)":[4],"tasks":[5,17],"focus":[6],"on":[7,93,154,186],"surface":[8],"image-text":[9],"matching,":[10],"while":[11],"text":[13],"(VTQA)":[16],"require":[18],"deeper":[19],"cross-modal":[20,144],"reasoning.":[21],"Current":[22],"Transformer-based":[23],"models":[24,83],"are":[25],"insufficient":[26],"in":[27,84,99],"screening":[28],"effective":[29],"features.":[30],"To":[31],"address":[32],"these":[33],"issues,":[34],"this":[35],"paper":[36],"proposes":[37],"a":[38,123],"new":[39],"cross-media":[40,124,132],"reasoning":[41,125],"network":[42,58,97,126],"(VTFCGNet)":[43],"that":[44,177],"integrates":[45],"Fourier":[46,102],"frequency":[47,103],"domain":[48,51,104],"and":[49,53,74,77,105,142,156,162,184,191],"spatial":[50,107],"self-attention":[52,120],"graph":[54],"attention":[55],"mechanisms.":[56],"The":[57,174],"can":[59],"adaptively":[60],"weight":[61],"the":[62,79,94,101,106,111,118,169,187],"feature":[63],"interactions":[64],"between":[65],"different":[66],"modalities,":[67,76],"achieve":[68],"deep":[69],"fusion":[70],"of":[71,81,113,166,172,182],"image,":[72],"text,":[73],"overcome":[78],"limitations":[80],"existing":[82],"VTQA":[85,155,192],"tasks.":[86],"VTFCGNet":[87,178],"first":[88],"extracts":[89],"key":[90],"entities":[91],"based":[92],"entity":[95],"extraction":[96],"(VTFC-Net)":[98],"both":[100,160],"domain,":[108],"thereby":[109],"reducing":[110],"interference":[112],"redundant":[114],"features":[115,141,165],"compared":[116,146],"to":[117,138,147],"traditional":[119,148],"mechanism.":[121],"Secondly,":[122],"(CRG-Net)":[127],"is":[128],"employed":[129],"for":[130],"multi-step":[131],"reasoning,":[133],"significantly":[134],"enhancing":[135],"its":[136],"ability":[137],"capture":[139],"fine-grained":[140],"model":[143],"relationships":[145],"VQA":[149,157,188],"models.":[150],"Finally,":[151],"comprehensive":[152],"experiments":[153],"v2":[158,189],"datasets\u2014using":[159],"grid-level":[161],"region-level":[163],"region":[167],"proposals\u2014validate":[168],"outstanding":[170],"performance":[171],"VTFCGNet.":[173],"findings":[175],"demonstrate":[176],"achieved":[179],"top":[180],"accuracies":[181],"71.93%":[183],"75.83%":[185],"test-dev":[190],"test":[193],"(English":[194],"Version)":[195],"datasets,":[196],"respectively.":[197]},"counts_by_year":[],"updated_date":"2026-04-30T09:15:22.047038","created_date":"2026-02-03T00:00:00"}
