{"id":"https://openalex.org/W4409092963","doi":"https://doi.org/10.48550/arxiv.2503.20418","title":"ITA-MDT: Image-Timestep-Adaptive Masked Diffusion Transformer Framework for Image-Based Virtual Try-On","display_name":"ITA-MDT: Image-Timestep-Adaptive Masked Diffusion Transformer Framework for Image-Based Virtual Try-On","publication_year":2025,"publication_date":"2025-03-26","ids":{"openalex":"https://openalex.org/W4409092963","doi":"https://doi.org/10.48550/arxiv.2503.20418"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2503.20418","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2503.20418","pdf_url":"https://arxiv.org/pdf/2503.20418","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2503.20418","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5046143080","display_name":"Ji Woo Hong","orcid":"https://orcid.org/0000-0002-3758-0307"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Hong, Ji Woo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Ton, Tri","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ton, Tri","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079431040","display_name":"Trung X. Pham","orcid":"https://orcid.org/0000-0003-4177-7054"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pham, Trung X.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5093047686","display_name":"Gwanhyeong Koo","orcid":"https://orcid.org/0009-0005-6455-3223"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Koo, Gwanhyeong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058873046","display_name":"Sunjae Yoon","orcid":"https://orcid.org/0000-0001-7458-5273"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yoon, Sunjae","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5073287748","display_name":"Chang D. Yoo","orcid":"https://orcid.org/0000-0002-0756-7179"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yoo, Chang D.","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5046143080"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11165","display_name":"Image and Video Quality Assessment","score":0.9886000156402588,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11165","display_name":"Image and Video Quality Assessment","score":0.9886000156402588,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10688","display_name":"Image and Signal Denoising Methods","score":0.988099992275238,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11019","display_name":"Image Enhancement Techniques","score":0.9824000000953674,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.6172278523445129},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5641805529594421},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.5225436091423035},{"id":"https://openalex.org/keywords/anisotropic-diffusion","display_name":"Anisotropic diffusion","score":0.45449769496917725},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4492267668247223},{"id":"https://openalex.org/keywords/diffusion","display_name":"Diffusion","score":0.4477415978908539},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.44237348437309265},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.14304092526435852},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.13011211156845093},{"id":"https://openalex.org/keywords/electrical-engineering","display_name":"Electrical engineering","score":0.1248508095741272}],"concepts":[{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.6172278523445129},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5641805529594421},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5225436091423035},{"id":"https://openalex.org/C203504353","wikidata":"https://www.wikidata.org/wiki/Q4765461","display_name":"Anisotropic diffusion","level":3,"score":0.45449769496917725},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4492267668247223},{"id":"https://openalex.org/C69357855","wikidata":"https://www.wikidata.org/wiki/Q163214","display_name":"Diffusion","level":2,"score":0.4477415978908539},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.44237348437309265},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.14304092526435852},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.13011211156845093},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.1248508095741272},{"id":"https://openalex.org/C97355855","wikidata":"https://www.wikidata.org/wiki/Q11473","display_name":"Thermodynamics","level":1,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2503.20418","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2503.20418","pdf_url":"https://arxiv.org/pdf/2503.20418","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2503.20418","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2503.20418","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2503.20418","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2503.20418","pdf_url":"https://arxiv.org/pdf/2503.20418","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2365492952","https://openalex.org/W2008174127","https://openalex.org/W2159155702","https://openalex.org/W1486519939","https://openalex.org/W2149857113","https://openalex.org/W2373576987","https://openalex.org/W2367205005","https://openalex.org/W2366833166","https://openalex.org/W2354182593","https://openalex.org/W2366836302"],"abstract_inverted_index":{"This":[0,145,208],"paper":[1],"introduces":[2],"ITA-MDT,":[3],"the":[4,18,25,62,65,109,123,126,134,152,164,167,171,184,192,200,204],"Image-Timestep-Adaptive":[5],"Masked":[6,26],"Diffusion":[7,27],"Transformer":[8,28],"Framework":[9],"for":[10,30],"Image-Based":[11],"Virtual":[12],"Try-On":[13],"(IVTON),":[14],"designed":[15],"to":[16,154,179,186,191],"overcome":[17],"limitations":[19],"of":[20,33,61,106,122,133,149,166,183,203,215],"previous":[21],"approaches":[22],"by":[23,138,226],"leveraging":[24],"(MDT)":[29],"improved":[31],"handling":[32],"both":[34],"global":[35,157,201],"garment":[36,48,142,185,206,221,231],"context":[37],"and":[38,141],"fine-grained":[39,160],"details.":[40],"The":[41],"IVTON":[42],"task":[43],"involves":[44],"seamlessly":[45],"superimposing":[46],"a":[47,53,58,83,91,115,130],"from":[49,125],"one":[50],"image":[51,127,143],"onto":[52],"person":[54,63],"in":[55,218,247],"another,":[56],"creating":[57],"realistic":[59],"depiction":[60],"wearing":[64],"specified":[66],"garment.":[67],"Unlike":[68],"conventional":[69],"diffusion-based":[70],"virtual":[71],"try-on":[72],"models":[73],"that":[74,119,236],"depend":[75],"on":[76,163],"large":[77],"pre-trained":[78],"U-Net":[79],"architectures,":[80],"ITA-MDT":[81,107,237],"leverages":[82],"lightweight,":[84],"scalable":[85],"transformer-based":[86],"denoising":[87,168,193],"diffusion":[88,139],"model":[89,153,194],"with":[90],"mask":[92],"latent":[93],"modeling":[94],"scheme,":[95],"achieving":[96],"competitive":[97],"results":[98,246],"while":[99,240],"reducing":[100],"computational":[101,224],"overhead.":[102],"A":[103],"key":[104],"component":[105],"is":[108,177],"Image-Timestep":[110],"Adaptive":[111],"Feature":[112],"Aggregator":[113],"(ITAFA),":[114],"dynamic":[116],"feature":[117,132],"aggregator":[118],"combines":[120],"all":[121],"features":[124],"encoder":[128],"into":[129],"unified":[131],"same":[135],"size,":[136],"guided":[137],"timestep":[140],"complexity.":[144],"enables":[146],"adaptive":[147],"weighting":[148],"features,":[150],"allowing":[151],"emphasize":[155],"either":[156],"information":[158,190,202],"or":[159],"details":[161,217],"based":[162],"requirements":[165],"stage.":[169],"Additionally,":[170],"Salient":[172],"Region":[173],"Extractor":[174],"(SRE)":[175],"module":[176],"presented":[178],"identify":[180],"complex":[181],"region":[182],"provide":[187],"high-resolution":[188],"local":[189],"as":[195],"an":[196],"additional":[197],"condition":[198],"alongside":[199],"full":[205],"image.":[207,232],"targeted":[209],"conditioning":[210],"strategy":[211],"enhances":[212],"detail":[213],"preservation":[214],"fine":[216],"highly":[219],"salient":[220],"regions,":[222],"optimizing":[223],"resources":[225],"avoiding":[227],"unnecessarily":[228],"processing":[229],"entire":[230],"Comparative":[233],"evaluations":[234],"confirms":[235],"improves":[238],"efficiency":[239],"maintaining":[241],"strong":[242],"performance,":[243],"reaching":[244],"state-of-the-art":[245],"several":[248],"metrics.":[249]},"counts_by_year":[],"updated_date":"2026-04-17T18:11:37.981687","created_date":"2025-10-10T00:00:00"}
