{"id":"https://openalex.org/W7164851374","doi":"https://doi.org/10.1145/3805622.3810697","title":"QMTD: Query Vector Guided Multi-Scale Text Detection","display_name":"QMTD: Query Vector Guided Multi-Scale Text Detection","publication_year":2026,"publication_date":"2026-06-15","ids":{"openalex":"https://openalex.org/W7164851374","doi":"https://doi.org/10.1145/3805622.3810697"},"language":null,"primary_location":{"id":"doi:10.1145/3805622.3810697","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810697","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3805622.3810697","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5078944975","display_name":"Zhiqiang You","orcid":"https://orcid.org/0000-0001-9924-0685"},"institutions":[{"id":"https://openalex.org/I16609230","display_name":"Hunan University","ror":"https://ror.org/05htk5m33","country_code":"CN","type":"education","lineage":["https://openalex.org/I16609230"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhiqiang You","raw_affiliation_strings":["Hunan University, Changsha, China"],"raw_orcid":"https://orcid.org/0000-0001-9924-0685","affiliations":[{"raw_affiliation_string":"Hunan University, Changsha, China","institution_ids":["https://openalex.org/I16609230"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019943830","display_name":"Yutong Jiang","orcid":"https://orcid.org/0000-0001-6558-4201"},"institutions":[{"id":"https://openalex.org/I16609230","display_name":"Hunan University","ror":"https://ror.org/05htk5m33","country_code":"CN","type":"education","lineage":["https://openalex.org/I16609230"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yutong Jiang","raw_affiliation_strings":["Hunan University, Changsha, China"],"raw_orcid":"https://orcid.org/0009-0000-4483-9498","affiliations":[{"raw_affiliation_string":"Hunan University, Changsha, China","institution_ids":["https://openalex.org/I16609230"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032549844","display_name":"Shenguang Huang","orcid":"https://orcid.org/0000-0002-1387-2153"},"institutions":[{"id":"https://openalex.org/I13591777","display_name":"University of Nottingham Ningbo China","ror":"https://ror.org/03y4dt428","country_code":"CN","type":"education","lineage":["https://openalex.org/I13591777","https://openalex.org/I142263535"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shenguang Huang","raw_affiliation_strings":["Ningbo Port Information Communication Co., Ltd., Ningbo, China"],"raw_orcid":"https://orcid.org/0000-0002-1387-2153","affiliations":[{"raw_affiliation_string":"Ningbo Port Information Communication Co., Ltd., Ningbo, China","institution_ids":["https://openalex.org/I13591777"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100413276","display_name":"Liu Z","orcid":"https://orcid.org/0000-0002-9199-6493"},"institutions":[{"id":"https://openalex.org/I16609230","display_name":"Hunan University","ror":"https://ror.org/05htk5m33","country_code":"CN","type":"education","lineage":["https://openalex.org/I16609230"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhangjie Liu","raw_affiliation_strings":["Hunan University, Changsha, China"],"raw_orcid":"https://orcid.org/0009-0008-7935-3912","affiliations":[{"raw_affiliation_string":"Hunan University, Changsha, China","institution_ids":["https://openalex.org/I16609230"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024137801","display_name":"\u5434\u9ad8\u5fb7","orcid":null},"institutions":[{"id":"https://openalex.org/I13591777","display_name":"University of Nottingham Ningbo China","ror":"https://ror.org/03y4dt428","country_code":"CN","type":"education","lineage":["https://openalex.org/I13591777","https://openalex.org/I142263535"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Gaode Wu","raw_affiliation_strings":["Ningbo Port Information Communication Co., Ltd., Ningbo, China"],"raw_orcid":"https://orcid.org/0009-0008-5189-9534","affiliations":[{"raw_affiliation_string":"Ningbo Port Information Communication Co., Ltd., Ningbo, China","institution_ids":["https://openalex.org/I13591777"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5138665076","display_name":"Haoyu Wang","orcid":"https://orcid.org/0009-0006-9843-1619"},"institutions":[{"id":"https://openalex.org/I4210112286","display_name":"The Metropolitan Opera (United States)","ror":"https://ror.org/022q3vw94","country_code":"US","type":"company","lineage":["https://openalex.org/I4210112286"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Haoyu Wang","raw_affiliation_strings":["Meta, NewYork, USA"],"raw_orcid":"https://orcid.org/0009-0006-9843-1619","affiliations":[{"raw_affiliation_string":"Meta, NewYork, USA","institution_ids":["https://openalex.org/I4210112286"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.93914919,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1346","last_page":"1354"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.972000002861023,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.972000002861023,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.007000000216066837,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.0038999998942017555,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/initialization","display_name":"Initialization","score":0.6468999981880188},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.5494999885559082},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4957999885082245},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.48579999804496765},{"id":"https://openalex.org/keywords/redundancy","display_name":"Redundancy (engineering)","score":0.4690999984741211},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.462799996137619},{"id":"https://openalex.org/keywords/feature-vector","display_name":"Feature vector","score":0.4521999955177307},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.42559999227523804},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.4147000014781952}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8109999895095825},{"id":"https://openalex.org/C114466953","wikidata":"https://www.wikidata.org/wiki/Q6034165","display_name":"Initialization","level":2,"score":0.6468999981880188},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5859000086784363},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.5494999885559082},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4957999885082245},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.48579999804496765},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.4690999984741211},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.462799996137619},{"id":"https://openalex.org/C83665646","wikidata":"https://www.wikidata.org/wiki/Q42139305","display_name":"Feature vector","level":2,"score":0.4521999955177307},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.42559999227523804},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.4147000014781952},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.4081000089645386},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3637000024318695},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.36169999837875366},{"id":"https://openalex.org/C160633673","wikidata":"https://www.wikidata.org/wiki/Q355198","display_name":"Pixel","level":2,"score":0.3564999997615814},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.3555000126361847},{"id":"https://openalex.org/C2983589003","wikidata":"https://www.wikidata.org/wiki/Q167555","display_name":"Text detection","level":3,"score":0.35249999165534973},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.35010001063346863},{"id":"https://openalex.org/C2983812711","wikidata":"https://www.wikidata.org/wiki/Q167555","display_name":"Text recognition","level":3,"score":0.33480000495910645},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.3321000039577484},{"id":"https://openalex.org/C99016210","wikidata":"https://www.wikidata.org/wiki/Q5488129","display_name":"Query expansion","level":2,"score":0.30489999055862427},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.30160000920295715},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.30090001225471497},{"id":"https://openalex.org/C7545210","wikidata":"https://www.wikidata.org/wiki/Q838123","display_name":"Data redundancy","level":2,"score":0.30079999566078186},{"id":"https://openalex.org/C1893757","wikidata":"https://www.wikidata.org/wiki/Q3653001","display_name":"Inversion (geology)","level":3,"score":0.2856999933719635},{"id":"https://openalex.org/C12267149","wikidata":"https://www.wikidata.org/wiki/Q282453","display_name":"Support vector machine","level":2,"score":0.27160000801086426},{"id":"https://openalex.org/C194222762","wikidata":"https://www.wikidata.org/wiki/Q114486","display_name":"Query by Example","level":4,"score":0.2639999985694885},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.25769999623298645},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.25769999623298645},{"id":"https://openalex.org/C72041958","wikidata":"https://www.wikidata.org/wiki/Q185092","display_name":"Flowchart","level":2,"score":0.25119999051094055}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3805622.3810697","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810697","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3805622.3810697","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810697","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.4370036721229553,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[{"id":"https://openalex.org/G6098702915","display_name":null,"funder_award_id":"62074055","funder_id":"https://openalex.org/F4320334015","funder_display_name":"Joint Fund of the National Natural Science Foundation of China and the Karst Science Research Center of Guizhou Province"}],"funders":[{"id":"https://openalex.org/F4320334015","display_name":"Joint Fund of the National Natural Science Foundation of China and the Karst Science Research Center of Guizhou Province","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":39,"referenced_works":["https://openalex.org/W2144554289","https://openalex.org/W2343052201","https://openalex.org/W2784050770","https://openalex.org/W2810028092","https://openalex.org/W2953894958","https://openalex.org/W2963150697","https://openalex.org/W2963187132","https://openalex.org/W2963299604","https://openalex.org/W2963353821","https://openalex.org/W2963647456","https://openalex.org/W2963857746","https://openalex.org/W2964685115","https://openalex.org/W2967155990","https://openalex.org/W2967615747","https://openalex.org/W2991626090","https://openalex.org/W2998621280","https://openalex.org/W3034514377","https://openalex.org/W3034792612","https://openalex.org/W3181016597","https://openalex.org/W3184364189","https://openalex.org/W3189348500","https://openalex.org/W3205961173","https://openalex.org/W4214922754","https://openalex.org/W4304084026","https://openalex.org/W4312312588","https://openalex.org/W4312593844","https://openalex.org/W4312815172","https://openalex.org/W4372263563","https://openalex.org/W4380839071","https://openalex.org/W4385292311","https://openalex.org/W4386071792","https://openalex.org/W4392910571","https://openalex.org/W4402716130","https://openalex.org/W4402774074","https://openalex.org/W4402774414","https://openalex.org/W4410738649","https://openalex.org/W4416034400","https://openalex.org/W7118747301","https://openalex.org/W7123349412"],"related_works":[],"abstract_inverted_index":{"Scene":[0],"text":[1,25,199,216],"detection":[2],"is":[3,27,106],"a":[4,53,74,144,207],"fundamental":[5],"component":[6],"of":[7,41,89,127],"optical":[8],"character":[9],"recognition,":[10],"document":[11],"intelligence,":[12],"assistive":[13],"technologies,":[14],"multilingual":[15],"content":[16],"retrieval,":[17],"and":[18,158,172,191,197,210],"multimodal":[19],"perception":[20],"systems.":[21],"However,":[22],"accurately":[23],"detecting":[24],"that":[26,62,176],"curved,":[28],"stylized,":[29],"densely":[30],"packed,":[31],"or":[32,180],"small-scale":[33],"in":[34,45,113],"natural":[35],"scenes":[36],"remains":[37],"challenging\u2014limiting":[38],"the":[39,65,70,81,86,90,96,114,124,128,164,186],"reliability":[40],"text-dependent":[42],"vision":[43],"systems":[44],"real-world":[46],"applications.":[47],"In":[48],"this":[49],"paper,":[50],"we":[51],"propose":[52],"Query":[54],"vector":[55],"guided":[56],"Multi-scale":[57],"Text":[58],"Detection":[59],"algorithm":[60],"(QMTD)":[61],"dynamically":[63],"generates":[64],"initial":[66,82],"query":[67,75,83],"vectors":[68,84],"for":[69,123,213],"transformer":[71,115],"decoder":[72,111,133],"using":[73],"embedding":[76],"initialization":[77],"module.":[78],"By":[79],"extracting":[80],"from":[85],"pixel":[87],"features":[88,112],"input":[91],"feature":[92,146,152,160],"map,":[93],"QMTD":[94,142,177,205],"improves":[95],"model\u2019s":[97],"generalization":[98],"ability.":[99],"Additionally,":[100],"an":[101],"attention":[102],"region":[103],"guidance":[104],"module":[105],"introduced":[107],"to":[108,116,154,162],"exploit":[109],"multi-layer":[110],"direct":[117],"subsequent":[118],"computational":[119,165],"processes,":[120],"which":[121,149],"allows":[122],"step-by-step":[125],"refinement":[126],"predicted":[129],"results":[130],"across":[131,195],"multiple":[132],"layers,":[134],"accelerating":[135],"training":[136],"while":[137],"improving":[138],"model":[139],"performance.":[140],"Finally,":[141],"incorporates":[143],"multi-scale":[145],"enhancement":[147],"module,":[148],"uses":[150],"high-resolution":[151],"maps":[153,161],"improve":[155],"segmentation":[156],"precision":[157],"low-resolution":[159],"balance":[163],"time":[166],"required.":[167],"Evaluations":[168],"on":[169],"CTW1500,":[170],"Total-Text,":[171],"ICDAR2015":[173],"benchmarks":[174],"demonstrate":[175],"achieves":[178],"competitive":[179],"state-of-the-art":[181],"H-mean,":[182],"converges":[183],"faster":[184],"than":[185],"DTTR":[187],"baseline":[188],"during":[189],"training,":[190],"maintains":[192],"robust":[193,214],"performance":[194],"curved":[196],"multi-oriented":[198],"scenarios.":[200],"These":[201],"findings":[202],"collectively":[203],"establish":[204],"as":[206],"highly":[208],"effective":[209],"practical":[211],"solution":[212],"scene":[215],"detection.":[217]},"counts_by_year":[],"updated_date":"2026-06-16T07:37:23.134862","created_date":"2026-06-16T00:00:00"}
