{"id":"https://openalex.org/W7129216225","doi":"https://doi.org/10.48550/arxiv.2602.13585","title":"Diff-Aid: Inference-time Adaptive Interaction Denoising for Rectified Text-to-Image Generation","display_name":"Diff-Aid: Inference-time Adaptive Interaction Denoising for Rectified Text-to-Image Generation","publication_year":2026,"publication_date":"2026-02-14","ids":{"openalex":"https://openalex.org/W7129216225","doi":"https://doi.org/10.48550/arxiv.2602.13585"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2602.13585","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.13585","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2602.13585","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5037293602","display_name":"Binglei Li","orcid":"https://orcid.org/0009-0005-5803-120X"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Li, Binglei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058786230","display_name":"Mengping Yang","orcid":"https://orcid.org/0000-0003-1503-9621"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Mengping","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111532053","display_name":"Zhiyu Tan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tan, Zhiyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126191852","display_name":"Junping Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Junping","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5126257712","display_name":"Hao Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Hao","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5037293602"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.7967000007629395,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.7967000007629395,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10481","display_name":"Computer Graphics and Visualization Techniques","score":0.05849999934434891,"subfield":{"id":"https://openalex.org/subfields/1704","display_name":"Computer Graphics and Computer-Aided Design"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12377","display_name":"Digital Humanities and Scholarship","score":0.030700000002980232,"subfield":{"id":"https://openalex.org/subfields/1208","display_name":"Literature and Literary Theory"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/flexibility","display_name":"Flexibility (engineering)","score":0.5835999846458435},{"id":"https://openalex.org/keywords/noise-reduction","display_name":"Noise reduction","score":0.555899977684021},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.4381999969482422},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.4196999967098236},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.3192000091075897},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.30630001425743103}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.745199978351593},{"id":"https://openalex.org/C2780598303","wikidata":"https://www.wikidata.org/wiki/Q65921492","display_name":"Flexibility (engineering)","level":2,"score":0.5835999846458435},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.555899977684021},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.526199996471405},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.4381999969482422},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.4196999967098236},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.3192000091075897},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.30630001425743103},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.29600000381469727},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.29269999265670776},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2919999957084656},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.2915000021457672},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.25589999556541443},{"id":"https://openalex.org/C2983327147","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Image denoising","level":3,"score":0.25529998540878296}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2602.13585","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.13585","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2602.13585","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.13585","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.4152913987636566}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"text-to-image":[1],"(T2I)":[2],"diffusion":[3],"models":[4,157],"have":[5],"achieved":[6],"remarkable":[7],"advancement,":[8],"yet":[9],"faithfully":[10],"following":[11],"complex":[12],"textual":[13,22,36,101],"descriptions":[14],"remains":[15],"challenging":[16],"due":[17],"to":[18,61,104],"insufficient":[19],"interactions":[20,30,46,78],"between":[21],"and":[23,42,50,58,76,82,100,129,138,148,156],"visual":[24,146],"features.":[25],"Prior":[26],"approaches":[27],"enhance":[28],"such":[29],"via":[31],"architectural":[32],"design":[33],"or":[34],"handcrafted":[35],"condition":[37],"weighting,":[38],"but":[39],"lack":[40],"flexibility":[41],"overlook":[43],"the":[44],"dynamic":[45],"across":[47,79,151],"different":[48,97],"blocks":[49,81],"denoising":[51,83],"stages.":[52],"To":[53],"provide":[54],"a":[55,67,110],"more":[56],"flexible":[57],"efficient":[59],"solution":[60],"this":[62],"problem,":[63],"we":[64],"propose":[65],"Diff-Aid,":[66],"lightweight":[68],"inference-time":[69],"method":[70],"that":[71,94],"adaptively":[72],"adjusts":[73],"per-token":[74],"text":[75],"image":[77],"transformer":[80],"timesteps.":[84],"Beyond":[85],"improving":[86],"generation":[87],"quality,":[88,147],"Diff-Aid":[89,113],"yields":[90],"interpretable":[91],"modulation":[92],"patterns":[93],"reveal":[95],"how":[96],"blocks,":[98],"timesteps,":[99],"tokens":[102],"contribute":[103],"semantic":[105],"alignment":[106],"during":[107],"denoising.":[108],"As":[109],"plug-and-play":[111],"module,":[112],"can":[114],"be":[115,159],"seamlessly":[116],"integrated":[117],"into":[118],"downstream":[119],"applications":[120],"for":[121],"further":[122],"improvement,":[123],"including":[124],"style":[125],"LoRAs,":[126],"controllable":[127],"generation,":[128],"zero-shot":[130],"editing.":[131],"Experiments":[132],"on":[133],"strong":[134],"baselines":[135],"(SD":[136],"3.5":[137],"FLUX)":[139],"demonstrate":[140],"consistent":[141],"improvements":[142],"in":[143],"prompt":[144],"adherence,":[145],"human":[149],"preference":[150],"various":[152],"metrics.":[153],"Our":[154],"code":[155],"will":[158],"released.":[160]},"counts_by_year":[],"updated_date":"2026-02-18T06:25:47.457606","created_date":"2026-02-18T00:00:00"}
