{"id":"https://openalex.org/W6967153802","doi":"https://doi.org/10.48550/arxiv.2506.04807","title":"MegaHan97K: A Large-Scale Dataset for Mega-Category Chinese Character Recognition with over 97K Categories","display_name":"MegaHan97K: A Large-Scale Dataset for Mega-Category Chinese Character Recognition with over 97K Categories","publication_year":2025,"publication_date":"2025-06-05","ids":{"openalex":"https://openalex.org/W6967153802","doi":"https://doi.org/10.48550/arxiv.2506.04807"},"language":"en","primary_location":{"id":"doi:10.48550/arxiv.2506.04807","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2506.04807","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2506.04807","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Zhang, Yuyi","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhang, Yuyi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Shi, Yongxin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shi, Yongxin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zhang, Peirong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Peirong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zhao, Yixin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Yixin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Yang, Zhenhua","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Zhenhua","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Jin, Lianwen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jin, Lianwen","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9919999837875366,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9919999837875366,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10824","display_name":"Image Retrieval and Classification Techniques","score":0.001500000013038516,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.0006000000284984708,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.6687999963760376},{"id":"https://openalex.org/keywords/character","display_name":"Character (mathematics)","score":0.6360999941825867},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.5526999831199646},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.47699999809265137},{"id":"https://openalex.org/keywords/optical-character-recognition","display_name":"Optical character recognition","score":0.4724999964237213},{"id":"https://openalex.org/keywords/bridge","display_name":"Bridge (graph theory)","score":0.4397999942302704}],"concepts":[{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.6687999963760376},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6582000255584717},{"id":"https://openalex.org/C2780861071","wikidata":"https://www.wikidata.org/wiki/Q1062934","display_name":"Character (mathematics)","level":2,"score":0.6360999941825867},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.5526999831199646},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5418999791145325},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5163999795913696},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.47699999809265137},{"id":"https://openalex.org/C546480517","wikidata":"https://www.wikidata.org/wiki/Q167555","display_name":"Optical character recognition","level":3,"score":0.4724999964237213},{"id":"https://openalex.org/C100776233","wikidata":"https://www.wikidata.org/wiki/Q2532492","display_name":"Bridge (graph theory)","level":2,"score":0.4397999942302704},{"id":"https://openalex.org/C2987247673","wikidata":"https://www.wikidata.org/wiki/Q167555","display_name":"Character recognition","level":3,"score":0.4099000096321106},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3578999936580658},{"id":"https://openalex.org/C2781051154","wikidata":"https://www.wikidata.org/wiki/Q8201","display_name":"Chinese characters","level":2,"score":0.351500004529953},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.31949999928474426},{"id":"https://openalex.org/C67905146","wikidata":"https://www.wikidata.org/wiki/Q5287646","display_name":"Document processing","level":2,"score":0.30559998750686646},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.2671000063419342},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2547999918460846}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2506.04807","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2506.04807","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2506.04807","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2506.04807","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Sustainable cities and communities","score":0.7337555885314941,"id":"https://metadata.un.org/sdg/11"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Foundational":[0],"to":[1,62,109],"the":[2,16,63,69,106,112,130,182,187,191,194,200,208],"Chinese":[3,7,18,95],"language":[4],"and":[5,12,46,148,169],"culture,":[6],"characters":[8],"encompass":[9],"extraordinarily":[10],"extensive":[11],"ever-expanding":[13],"categories,":[14],"with":[15,68,193],"latest":[17,113],"GB18030-2022":[19,114],"standard":[20],"containing":[21,73],"87,887":[22],"categories.":[23,76],"The":[24,214],"accurate":[25],"recognition":[26,58],"of":[27,31,65,94,184,202,211],"this":[28,79],"vast":[29],"number":[30],"characters,":[32],"termed":[33],"mega-category":[34,57,159],"recognition,":[35,168],"presents":[36],"a":[37,85],"formidable":[38],"yet":[39],"crucial":[40],"challenge":[41],"for":[42,178],"cultural":[43],"heritage":[44],"preservation":[45],"digital":[47],"applications.":[48],"Despite":[49],"significant":[50],"advances":[51],"in":[52,158,199,207],"Optical":[53],"Character":[54],"Recognition":[55],"(OCR),":[56],"remains":[59],"unexplored":[60],"due":[61],"absence":[64],"comprehensive":[66],"datasets,":[67],"largest":[70,195],"existing":[71,124],"dataset":[72,88,108,192,215],"merely":[74],"16,151":[75],"To":[77,181],"bridge":[78],"critical":[80],"gap,":[81],"we":[82],"introduce":[83],"MegaHan97K,":[84],"mega-category,":[86],"large-scale":[87],"covering":[89],"an":[90],"unprecedented":[91],"97,455":[92],"categories":[93,122,140],"characters.":[96],"Our":[97],"work":[98],"offers":[99],"three":[100,143],"major":[101],"contributions:":[102],"(1)":[103],"MegaHan97K":[104],"is":[105,189,216],"first":[107],"fully":[110],"support":[111],"standard,":[115],"providing":[116,135],"at":[117,218],"least":[118],"six":[119],"times":[120],"more":[121],"than":[123],"datasets;":[125],"(2)":[126],"It":[127],"effectively":[128],"addresses":[129],"long-tail":[131],"distribution":[132],"problem":[133],"by":[134],"balanced":[136],"samples":[137],"across":[138],"all":[139],"through":[141],"its":[142],"distinct":[144],"subsets:":[145],"handwritten,":[146],"historical":[147],"synthetic":[149],"subsets;":[150],"(3)":[151],"Comprehensive":[152],"benchmarking":[153],"experiments":[154],"reveal":[155],"new":[156],"challenges":[157],"scenarios,":[160],"including":[161],"increased":[162],"storage":[163],"demands,":[164],"morphologically":[165],"similar":[166],"character":[167],"zero-shot":[170],"learning":[171],"difficulties,":[172],"while":[173],"also":[174,206],"unlocking":[175],"substantial":[176],"opportunities":[177],"future":[179],"research.":[180],"best":[183],"our":[185],"knowledge,":[186],"MetaHan97K":[188],"likely":[190],"classes":[196],"not":[197],"only":[198],"field":[201],"OCR":[203],"but":[204],"may":[205],"broader":[209],"domain":[210],"pattern":[212],"recognition.":[213],"available":[217],"https://github.com/SCUT-DLVCLab/MegaHan97K.":[219]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
