{"id":"https://openalex.org/W4385292133","doi":"https://doi.org/10.48550/arxiv.2307.13244","title":"Multi-Granularity Prediction with Learnable Fusion for Scene Text Recognition","display_name":"Multi-Granularity Prediction with Learnable Fusion for Scene Text Recognition","publication_year":2023,"publication_date":"2023-07-25","ids":{"openalex":"https://openalex.org/W4385292133","doi":"https://doi.org/10.48550/arxiv.2307.13244"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2307.13244","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2307.13244","pdf_url":"https://arxiv.org/pdf/2307.13244","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2307.13244","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101598629","display_name":"Cheng Da","orcid":"https://orcid.org/0000-0003-3037-7135"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Da, Cheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100396039","display_name":"Peng Wang","orcid":"https://orcid.org/0000-0002-5397-9115"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Peng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100313026","display_name":"Cong Yao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yao, Cong","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5101598629"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":4,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9957000017166138,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9957000017166138,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9847000241279602,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11550","display_name":"Text and Document Classification Technologies","score":0.982699990272522,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7888225317001343},{"id":"https://openalex.org/keywords/granularity","display_name":"Granularity","score":0.607882559299469},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5986288785934448},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.5237051248550415},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5130568742752075},{"id":"https://openalex.org/keywords/generality","display_name":"Generality","score":0.49261051416397095},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4847218990325928},{"id":"https://openalex.org/keywords/optical-character-recognition","display_name":"Optical character recognition","score":0.45329684019088745},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.3606293201446533},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.2001509666442871}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7888225317001343},{"id":"https://openalex.org/C177774035","wikidata":"https://www.wikidata.org/wiki/Q1246948","display_name":"Granularity","level":2,"score":0.607882559299469},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5986288785934448},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.5237051248550415},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5130568742752075},{"id":"https://openalex.org/C2780767217","wikidata":"https://www.wikidata.org/wiki/Q5532421","display_name":"Generality","level":2,"score":0.49261051416397095},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4847218990325928},{"id":"https://openalex.org/C546480517","wikidata":"https://www.wikidata.org/wiki/Q167555","display_name":"Optical character recognition","level":3,"score":0.45329684019088745},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3606293201446533},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.2001509666442871},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.0},{"id":"https://openalex.org/C542102704","wikidata":"https://www.wikidata.org/wiki/Q183257","display_name":"Psychotherapist","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2307.13244","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2307.13244","pdf_url":"https://arxiv.org/pdf/2307.13244","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2307.13244","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2307.13244","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2307.13244","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2307.13244","pdf_url":"https://arxiv.org/pdf/2307.13244","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[{"score":0.800000011920929,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4385292133.pdf","grobid_xml":"https://content.openalex.org/works/W4385292133.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2045049461","https://openalex.org/W1978893398","https://openalex.org/W2201908702","https://openalex.org/W4381094582","https://openalex.org/W2369625323","https://openalex.org/W2364579609","https://openalex.org/W1977906818","https://openalex.org/W1522139108","https://openalex.org/W2353528968","https://openalex.org/W2032776242"],"abstract_inverted_index":{"Due":[0],"to":[1,66,121,153,190,197],"the":[2,59,125,129,148,154,169,178,192,237,240],"enormous":[3],"technical":[4],"challenges":[5],"and":[6,38,82,87,107,139,247],"wide":[7],"range":[8],"of":[9,195,209,239],"applications,":[10],"scene":[11,99,215,233],"text":[12,100,216,234],"recognition":[13,171,207],"(STR)":[14],"has":[15,45],"been":[16,35],"an":[17,132,198,205],"active":[18],"research":[19],"topic":[20],"in":[21,62,131,143,151],"computer":[22],"vision":[23,74,105],"for":[24,98,175,214],"years.":[25],"To":[26,110,167],"tackle":[27],"this":[28,52],"tough":[29],"problem,":[30],"numerous":[31],"innovative":[32],"methods":[33],"have":[34],"successively":[36],"proposed,":[37],"incorporating":[39],"linguistic":[40,112],"knowledge":[41],"into":[42,128,147],"STR":[43,75,196],"models":[44,97,106,248],"recently":[46],"become":[47],"a":[48,68,83,117],"prominent":[49],"trend.":[50],"In":[51],"work,":[53],"we":[54,114],"first":[55],"draw":[56],"inspiration":[57],"from":[58,124],"recent":[60],"progress":[61],"Vision":[63],"Transformer":[64],"(ViT)":[65],"construct":[67],"conceptually":[69],"simple":[70],"yet":[71],"functionally":[72],"powerful":[73],"model,":[76],"which":[77],"is":[78,165,188],"built":[79],"upon":[80],"ViT":[81],"tailored":[84],"Adaptive":[85],"Addressing":[86],"Aggregation":[88],"(A$^3$)":[89],"module.":[90],"It":[91],"already":[92],"outperforms":[93],"most":[94],"previous":[95],"state-of-the-art":[96,222],"recognition,":[101],"including":[102],"both":[103],"pure":[104],"language-augmented":[108],"methods.":[109],"integrate":[111],"knowledge,":[113],"further":[115],"propose":[116],"Multi-Granularity":[118],"Prediction":[119],"strategy":[120],"inject":[122],"information":[123],"language":[126,162],"modality":[127],"model":[130,163],"implicit":[133],"way,":[134],"\\ie,":[135],"subword":[136],"representations":[137],"(BPE":[138],"WordPiece)":[140],"widely":[141],"used":[142],"NLP":[144],"are":[145,181],"introduced":[146],"output":[149],"space,":[150],"addition":[152],"conventional":[155],"character":[156],"level":[157],"representation,":[158],"while":[159],"no":[160],"independent":[161],"(LM)":[164],"adopted.":[166],"produce":[168],"final":[170],"results,":[172],"two":[173],"strategies":[174],"effectively":[176],"fusing":[177],"multi-granularity":[179],"predictions":[180],"devised.":[182],"The":[183,244],"resultant":[184],"algorithm":[185],"(termed":[186],"MGP-STR)":[187],"able":[189],"push":[191],"performance":[193],"envelope":[194],"even":[199],"higher":[200],"level.":[201],"Specifically,":[202],"MGP-STR":[203,242],"achieves":[204,221],"average":[206],"accuracy":[208],"$94\\%$":[210],"on":[211,224],"standard":[212],"benchmarks":[213,227],"recognition.":[217],"Moreover,":[218],"it":[219],"also":[220],"results":[223],"widely-used":[225],"handwritten":[226],"as":[228,230],"well":[229],"more":[231],"challenging":[232],"datasets,":[235],"demonstrating":[236],"generality":[238],"proposed":[241],"algorithm.":[243],"source":[245],"code":[246],"will":[249],"be":[250],"available":[251],"at:":[252],"\\url{https://github.com/AlibabaResearch/AdvancedLiterateMachinery/tree/main/OCR/MGP-STR}.":[253]},"counts_by_year":[{"year":2024,"cited_by_count":4}],"updated_date":"2026-03-08T08:50:53.379069","created_date":"2023-07-27T00:00:00"}
