{"id":"https://openalex.org/W4414903208","doi":"https://doi.org/10.1109/iccv51701.2025.01708","title":"TF-TI2I: Training-Free Text-And-Image-To-Image Generation via Multi-Modal Implicit-Context Learning in Text-To-Image Models","display_name":"TF-TI2I: Training-Free Text-And-Image-To-Image Generation via Multi-Modal Implicit-Context Learning in Text-To-Image Models","publication_year":2025,"publication_date":"2025-10-19","ids":{"openalex":"https://openalex.org/W4414903208","doi":"https://doi.org/10.1109/iccv51701.2025.01708"},"language":"en","primary_location":{"id":"doi:10.1109/iccv51701.2025.01708","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccv51701.2025.01708","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision (ICCV)","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2503.15283","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5113184523","display_name":"Teng-Fang Hsiao","orcid":null},"institutions":[{"id":"https://openalex.org/I148366613","display_name":"National Yang Ming Chiao Tung University","ror":"https://ror.org/00se2k293","country_code":"TW","type":"education","lineage":["https://openalex.org/I148366613"]}],"countries":["TW"],"is_corresponding":true,"raw_author_name":"Teng-Fang Hsiao","raw_affiliation_strings":["National Yang Ming Chiao Tung University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Yang Ming Chiao Tung University","institution_ids":["https://openalex.org/I148366613"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088821614","display_name":"Bo-Kai Ruan","orcid":"https://orcid.org/0000-0002-9847-3628"},"institutions":[{"id":"https://openalex.org/I148366613","display_name":"National Yang Ming Chiao Tung University","ror":"https://ror.org/00se2k293","country_code":"TW","type":"education","lineage":["https://openalex.org/I148366613"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Bo-Kai Ruan","raw_affiliation_strings":["National Yang Ming Chiao Tung University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Yang Ming Chiao Tung University","institution_ids":["https://openalex.org/I148366613"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059434130","display_name":"Yi-Lun Wu","orcid":"https://orcid.org/0000-0001-6343-1084"},"institutions":[{"id":"https://openalex.org/I148366613","display_name":"National Yang Ming Chiao Tung University","ror":"https://ror.org/00se2k293","country_code":"TW","type":"education","lineage":["https://openalex.org/I148366613"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Yi-Lun Wu","raw_affiliation_strings":["National Yang Ming Chiao Tung University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Yang Ming Chiao Tung University","institution_ids":["https://openalex.org/I148366613"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102674792","display_name":"Tzu-Ling Lin","orcid":null},"institutions":[{"id":"https://openalex.org/I148366613","display_name":"National Yang Ming Chiao Tung University","ror":"https://ror.org/00se2k293","country_code":"TW","type":"education","lineage":["https://openalex.org/I148366613"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Tzu-Ling Lin","raw_affiliation_strings":["National Yang Ming Chiao Tung University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Yang Ming Chiao Tung University","institution_ids":["https://openalex.org/I148366613"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5040050806","display_name":"Hong-Han Shuai","orcid":"https://orcid.org/0000-0003-2216-077X"},"institutions":[{"id":"https://openalex.org/I148366613","display_name":"National Yang Ming Chiao Tung University","ror":"https://ror.org/00se2k293","country_code":"TW","type":"education","lineage":["https://openalex.org/I148366613"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Hong-Han Shuai","raw_affiliation_strings":["National Yang Ming Chiao Tung University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Yang Ming Chiao Tung University","institution_ids":["https://openalex.org/I148366613"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5113184523"],"corresponding_institution_ids":["https://openalex.org/I148366613"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.24740997,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"18377","last_page":"18387"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9887999892234802,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9887999892234802,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10824","display_name":"Image Retrieval and Classification Techniques","score":0.9520000219345093,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9514999985694885,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.6033999919891357},{"id":"https://openalex.org/keywords/masking","display_name":"Masking (illustration)","score":0.5907999873161316},{"id":"https://openalex.org/keywords/point","display_name":"Point (geometry)","score":0.5569999814033508},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.4851999878883362},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.4487999975681305},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.3871000111103058},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.36340001225471497},{"id":"https://openalex.org/keywords/visual-masking","display_name":"Visual masking","score":0.35409998893737793}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.819100022315979},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6147000193595886},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.6033999919891357},{"id":"https://openalex.org/C2777402240","wikidata":"https://www.wikidata.org/wiki/Q6783436","display_name":"Masking (illustration)","level":2,"score":0.5907999873161316},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.5569999814033508},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.4851999878883362},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.4487999975681305},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.43790000677108765},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.3871000111103058},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.36340001225471497},{"id":"https://openalex.org/C2779200073","wikidata":"https://www.wikidata.org/wiki/Q18395575","display_name":"Visual masking","level":4,"score":0.35409998893737793},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.31949999928474426},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.3003999888896942},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2994999885559082},{"id":"https://openalex.org/C2778029271","wikidata":"https://www.wikidata.org/wiki/Q5421931","display_name":"Extension (predicate logic)","level":2,"score":0.2978000044822693},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.29490000009536743},{"id":"https://openalex.org/C774472","wikidata":"https://www.wikidata.org/wiki/Q6760393","display_name":"Margin (machine learning)","level":2,"score":0.28999999165534973},{"id":"https://openalex.org/C55020928","wikidata":"https://www.wikidata.org/wiki/Q3813865","display_name":"Image quality","level":3,"score":0.2896000146865845},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.2849000096321106},{"id":"https://openalex.org/C9417928","wikidata":"https://www.wikidata.org/wiki/Q1070689","display_name":"Image processing","level":3,"score":0.27559998631477356},{"id":"https://openalex.org/C22367795","wikidata":"https://www.wikidata.org/wiki/Q7625208","display_name":"Structured prediction","level":2,"score":0.2630999982357025},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2630999982357025},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.25949999690055847},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.2522999942302704}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/iccv51701.2025.01708","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccv51701.2025.01708","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision (ICCV)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2503.15283","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2503.15283","pdf_url":"https://arxiv.org/pdf/2503.15283","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2503.15283","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2503.15283","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2503.15283","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2503.15283","pdf_url":"https://arxiv.org/pdf/2503.15283","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1295885602","display_name":null,"funder_award_id":"NSTC-112-2221-E-A49-059MY3,NSTC-112-2221-E-A49-094-MY3","funder_id":"https://openalex.org/F2461203286","funder_display_name":"National Science and Technology Council"}],"funders":[{"id":"https://openalex.org/F2461203286","display_name":"National Science and Technology Council","ror":"https://ror.org/02kv4zf79"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Text-and-Image-To-Image":[0],"(TI2I),":[1],"an":[2],"extension":[3],"of":[4,117],"Text-To-Image":[5],"(T2I),":[6],"integrates":[7],"image":[8,15,22],"inputs":[9],"with":[10,40,161],"textual":[11,80],"instructions":[12],"to":[13,120],"enhance":[14,91],"generation.":[16],"Existing":[17],"methods":[18],"often":[19],"partially":[20],"utilize":[21],"inputs,":[23],"focusing":[24],"on":[25,70],"specific":[26],"elements":[27],"like":[28],"objects":[29],"or":[30,32],"styles,":[31],"they":[33],"experience":[34],"a":[35,96,153],"decline":[36],"in":[37,74,144,176],"generation":[38],"quality":[39],"complex,":[41],"multi-image":[42],"instructions.":[43],"To":[44],"overcome":[45],"these":[46],"challenges,":[47],"we":[48,76,147],"introduce":[49,149],"Training-Free":[50],"Text-and-Image-to-Image":[51],"(TF-TI2I),":[52],"which":[53,75],"adapts":[54],"cutting-edge":[55],"T2I":[56,163],"models":[57],"such":[58],"as":[59],"SD3":[60],"without":[61],"the":[62,71,115,133,142,150],"need":[63],"for":[64,137,157],"additional":[65],"training.":[66],"Our":[67,165],"method":[68],"capitalizes":[69],"MM-DiT":[72],"architecture,":[73],"point":[77],"out":[78],"that":[79],"tokens":[81,119],"can":[82],"implicitly":[83],"learn":[84],"visual":[85,98,122],"information":[86,105],"from":[87,100],"vision":[88,139],"tokens.":[89],"We":[90],"this":[92,112],"interaction":[93],"by":[94,131],"extracting":[95],"condensed":[97],"representation":[99],"reference":[101],"images,":[102],"facilitating":[103],"selective":[104],"sharing":[106],"through":[107],"Reference":[108],"Contextual":[109],"Masking":[110],"--":[111],"technique":[113],"confines":[114],"usage":[116],"contextual":[118],"instruction-relevant":[121],"information.":[123],"Additionally,":[124],"our":[125],"Winner-Takes-All":[126],"module":[127],"mitigates":[128],"distribution":[129],"shifts":[130],"prioritizing":[132],"most":[134],"pertinent":[135],"references":[136],"each":[138],"token.":[140],"Addressing":[141],"gap":[143],"TI2I":[145,158],"evaluation,":[146],"also":[148],"FG-TI2I":[151],"Bench,":[152],"comprehensive":[154],"benchmark":[155],"tailored":[156],"and":[159],"compatible":[160],"existing":[162],"methods.":[164],"approach":[166],"shows":[167],"robust":[168],"performance":[169],"across":[170],"various":[171],"benchmarks,":[172],"confirming":[173],"its":[174],"effectiveness":[175],"handling":[177],"complex":[178],"image-generation":[179],"tasks.":[180]},"counts_by_year":[],"updated_date":"2026-05-06T06:03:25.996018","created_date":"2025-10-10T00:00:00"}
