{"id":"https://openalex.org/W7119078187","doi":"https://doi.org/10.48550/arxiv.2601.01593","title":"Beyond Patches: Global-aware Autoregressive Model for Multimodal Few-Shot Font Generation","display_name":"Beyond Patches: Global-aware Autoregressive Model for Multimodal Few-Shot Font Generation","publication_year":2026,"publication_date":"2026-01-04","ids":{"openalex":"https://openalex.org/W7119078187","doi":"https://doi.org/10.48550/arxiv.2601.01593"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2601.01593","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.01593","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2601.01593","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5122200790","display_name":"Haonan Cai","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Cai, Haonan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101348014","display_name":"Yuxuan Luo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luo, Yuxuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5114425298","display_name":"Zhouhui Lian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lian, Zhouhui","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5122200790"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.7627000212669373,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.7627000212669373,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10481","display_name":"Computer Graphics and Visualization Techniques","score":0.039000000804662704,"subfield":{"id":"https://openalex.org/subfields/1704","display_name":"Computer Graphics and Computer-Aided Design"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.019500000402331352,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/font","display_name":"Font","score":0.8205000162124634},{"id":"https://openalex.org/keywords/glyph","display_name":"Glyph (data visualization)","score":0.6011999845504761},{"id":"https://openalex.org/keywords/fidelity","display_name":"Fidelity","score":0.5108000040054321},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.4668000042438507},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.41929998993873596},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.4165000021457672},{"id":"https://openalex.org/keywords/style","display_name":"Style (visual arts)","score":0.3635999858379364},{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.35600000619888306}],"concepts":[{"id":"https://openalex.org/C2777737414","wikidata":"https://www.wikidata.org/wiki/Q4868296","display_name":"Font","level":2,"score":0.8205000162124634},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7366999983787537},{"id":"https://openalex.org/C142816647","wikidata":"https://www.wikidata.org/wiki/Q5573018","display_name":"Glyph (data visualization)","level":3,"score":0.6011999845504761},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5209000110626221},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.5108000040054321},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4961000084877014},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.4668000042438507},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.41929998993873596},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.4165000021457672},{"id":"https://openalex.org/C2776445246","wikidata":"https://www.wikidata.org/wiki/Q1792644","display_name":"Style (visual arts)","level":2,"score":0.3635999858379364},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.35600000619888306},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.33889999985694885},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.3328999876976013},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.32710000872612},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.28380000591278076},{"id":"https://openalex.org/C94124525","wikidata":"https://www.wikidata.org/wiki/Q912550","display_name":"Categorization","level":2,"score":0.27559998631477356},{"id":"https://openalex.org/C199776023","wikidata":"https://www.wikidata.org/wiki/Q202875","display_name":"Negotiation","level":2,"score":0.26339998841285706},{"id":"https://openalex.org/C186644900","wikidata":"https://www.wikidata.org/wiki/Q194152","display_name":"Parsing","level":2,"score":0.2621999979019165},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.26089999079704285},{"id":"https://openalex.org/C177284502","wikidata":"https://www.wikidata.org/wiki/Q1005390","display_name":"Adapter (computing)","level":2,"score":0.25780001282691956},{"id":"https://openalex.org/C554936623","wikidata":"https://www.wikidata.org/wiki/Q199657","display_name":"Reading (process)","level":2,"score":0.2574000060558319},{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.2526000142097473}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2601.01593","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.01593","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2601.01593","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.01593","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.4285639226436615,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Manual":[0],"font":[1,69,96,112],"design":[2],"is":[3,56],"an":[4],"intricate":[5],"process":[6],"that":[7,119,151,162],"transforms":[8],"a":[9,14,105,116,129,138,148],"stylistic":[10,38,93,127,180],"visual":[11,83],"concept":[12],"into":[13],"coherent":[15,68],"glyph":[16],"set.":[17],"This":[18],"challenge":[19],"persists":[20],"in":[21,91,169],"automated":[22],"Few-shot":[23],"Font":[24],"Generation":[25],"(FFG),":[26],"where":[27],"models":[28,46],"often":[29],"struggle":[30],"to":[31,54],"preserve":[32],"both":[33,122],"the":[34,77,87],"structural":[35,154],"integrity":[36],"and":[37,85,125,147,156,174],"fidelity":[39,155],"from":[40],"limited":[41],"references.":[42],"While":[43],"autoregressive":[44],"(AR)":[45],"have":[47],"demonstrated":[48],"impressive":[49],"generative":[50],"capabilities,":[51],"their":[52],"application":[53],"FFG":[55,73,166],"constrained":[57],"by":[58],"conventional":[59],"patch-level":[60],"tokenization,":[61],"which":[62],"neglects":[63],"global":[64,126,171],"dependencies":[65],"crucial":[66],"for":[67,109],"synthesis.":[70],"Moreover,":[71],"existing":[72,165],"methods":[74],"remain":[75],"within":[76],"image-to-image":[78],"paradigm,":[79],"relying":[80],"solely":[81],"on":[82],"references":[84],"overlooking":[86],"role":[88],"of":[89],"language":[90],"conveying":[92],"intent":[94],"during":[95],"design.":[97],"To":[98],"address":[99],"these":[100],"limitations,":[101],"we":[102],"propose":[103],"GAR-Font,":[104],"novel":[106],"AR":[107],"framework":[108],"multimodal":[110,130,145],"few-shot":[111],"generation.":[113],"GAR-Font":[114,163],"introduces":[115],"global-aware":[117],"tokenizer":[118],"effectively":[120],"captures":[121],"local":[123],"structures":[124],"patterns,":[128],"style":[131,135,157,172],"encoder":[132],"offering":[133],"flexible":[134],"control":[136],"through":[137],"lightweight":[139],"language-style":[140],"adapter":[141],"without":[142],"requiring":[143],"intensive":[144],"pretraining,":[146],"post-refinement":[149],"pipeline":[150],"further":[152],"enhances":[153],"coherence.":[158],"Extensive":[159],"experiments":[160],"show":[161],"outperforms":[164],"methods,":[167],"excelling":[168],"maintaining":[170],"faithfulness":[173],"achieving":[175],"higher-quality":[176],"results":[177],"with":[178],"textual":[179],"guidance.":[181]},"counts_by_year":[],"updated_date":"2026-01-08T20:10:11.968330","created_date":"2026-01-08T00:00:00"}
