{"id":"https://openalex.org/W4405655795","doi":"https://doi.org/10.48550/arxiv.2412.15213","title":"Flowing from Words to Pixels: A Noise-Free Framework for Cross-Modality Evolution","display_name":"Flowing from Words to Pixels: A Noise-Free Framework for Cross-Modality Evolution","publication_year":2024,"publication_date":"2024-12-19","ids":{"openalex":"https://openalex.org/W4405655795","doi":"https://doi.org/10.48550/arxiv.2412.15213"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2412.15213","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.15213","pdf_url":"https://arxiv.org/pdf/2412.15213","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2412.15213","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5069192496","display_name":"Qihao Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Liu, Qihao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012540341","display_name":"Xi Yin","orcid":"https://orcid.org/0000-0002-7519-1978"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yin, Xi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5086706224","display_name":"Alan Yuille","orcid":"https://orcid.org/0000-0001-5207-9249"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuille, Alan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112439220","display_name":"Andrew H. Brown","orcid":"https://orcid.org/0000-0002-4565-533X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Brown, Andrew","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5060023106","display_name":"Mannat Singh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Singh, Mannat","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5069192496"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12090","display_name":"Language and cultural evolution","score":0.83160001039505,"subfield":{"id":"https://openalex.org/subfields/3316","display_name":"Cultural Studies"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T12090","display_name":"Language and cultural evolution","score":0.83160001039505,"subfield":{"id":"https://openalex.org/subfields/3316","display_name":"Cultural Studies"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11975","display_name":"Evolutionary Algorithms and Applications","score":0.8253999948501587,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.6673659682273865},{"id":"https://openalex.org/keywords/pixel","display_name":"Pixel","score":0.6581544876098633},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.4503180682659149},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3682619333267212}],"concepts":[{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.6673659682273865},{"id":"https://openalex.org/C160633673","wikidata":"https://www.wikidata.org/wiki/Q355198","display_name":"Pixel","level":2,"score":0.6581544876098633},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4503180682659149},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3682619333267212}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2412.15213","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.15213","pdf_url":"https://arxiv.org/pdf/2412.15213","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2412.15213","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2412.15213","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2412.15213","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.15213","pdf_url":"https://arxiv.org/pdf/2412.15213","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4405655795.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2385859805","https://openalex.org/W2530972254","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109"],"abstract_inverted_index":{"Diffusion":[0,79],"models,":[1,80],"and":[2,67,101,139,146,166,190,200,252],"their":[3],"generalization,":[4],"flow":[5,74,111,152,188],"matching,":[6,189],"have":[7],"had":[8],"a":[9,29,59,98,116,144,168,179],"remarkable":[10],"impact":[11],"on":[12,233],"the":[13,19,25,37,63,86,103,120,126,132,136,156,163,217,222,238],"field":[14],"of":[15,33,73,105,122,128,158,224],"media":[16,39,265],"generation.":[17,266],"Here,":[18],"conventional":[20],"approach":[21],"is":[22,55,76,232],"to":[23,36,53,89,114,125,162,170,260],"learn":[24,115],"complex":[26],"mapping":[27,50,118,245],"from":[28,51,119],"simple":[30,147],"source":[31,87],"distribution":[32,88,121,127,138],"Gaussian":[34],"noise":[35,52,137],"target":[38],"distribution.":[40],"For":[41],"cross-modal":[42,151,242,264],"tasks":[43],"such":[44],"as":[45],"text-to-image":[46],"generation,":[47],"this":[48,94,257],"same":[49],"image":[54,248,253],"learnt":[56],"whilst":[57],"including":[58],"conditioning":[60,140],"mechanism":[61],"in":[62,93,212,216,263],"model.":[64],"One":[65],"key":[66],"thus":[68,130],"far":[69],"relatively":[70],"unexplored":[71],"feature":[72],"matching":[75,112],"that,":[77],"unlike":[78],"they":[81],"are":[82],"not":[83],"constrained":[84],"for":[85,134,150,175,206,240],"be":[90],"noise.":[91],"Hence,":[92],"paper,":[95],"we":[96,107,191,227],"propose":[97],"paradigm":[99],"shift,":[100],"ask":[102],"question":[104],"whether":[106],"can":[108],"instead":[109],"train":[110],"models":[113],"direct":[117],"one":[123],"modality":[124],"another,":[129],"obviating":[131],"need":[133],"both":[135],"mechanism.":[141],"We":[142,154,255],"present":[143],"general":[145],"framework,":[148],"CrossFlow,":[149],"matching.":[153],"show":[155,192,229],"importance":[157],"applying":[159],"Variational":[160],"Encoders":[161],"input":[164],"data,":[165],"introduce":[167],"method":[169],"enable":[171],"Classifier-free":[172],"guidance.":[173],"Surprisingly,":[174],"text-to-image,":[176],"CrossFlow":[177,231],"with":[178,197,235],"vanilla":[180],"transformer":[181],"without":[182],"cross":[183],"attention":[184],"slightly":[185],"outperforms":[186,237],"standard":[187],"that":[193,230],"it":[194],"scales":[195],"better":[196],"training":[198],"steps":[199],"model":[201],"size,":[202],"while":[203],"also":[204,228],"allowing":[205],"interesting":[207],"latent":[208],"arithmetic":[209],"which":[210],"results":[211],"semantically":[213],"meaningful":[214],"edits":[215],"output":[218],"space.":[219],"To":[220],"demonstrate":[221],"generalizability":[223],"our":[225],"approach,":[226],"par":[234],"or":[236],"state-of-the-art":[239],"various":[241],"/":[243],"intra-modal":[244],"tasks,":[246],"viz.":[247],"captioning,":[249],"depth":[250],"estimation,":[251],"super-resolution.":[254],"hope":[256],"paper":[258],"contributes":[259],"accelerating":[261],"progress":[262]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-03-10T16:38:18.471706","created_date":"2024-12-21T00:00:00"}
