{"id":"https://openalex.org/W4405903305","doi":"https://doi.org/10.48550/arxiv.2412.19531","title":"Is Your Text-to-Image Model Robust to Caption Noise?","display_name":"Is Your Text-to-Image Model Robust to Caption Noise?","publication_year":2024,"publication_date":"2024-12-27","ids":{"openalex":"https://openalex.org/W4405903305","doi":"https://doi.org/10.48550/arxiv.2412.19531"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2412.19531","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.19531","pdf_url":"https://arxiv.org/pdf/2412.19531","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2412.19531","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5108259382","display_name":"Weichen Yu","orcid":"https://orcid.org/0009-0003-7935-2358"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yu, Weichen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101919600","display_name":"Zidong Yang","orcid":"https://orcid.org/0000-0003-0277-3333"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Ziyan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048135392","display_name":"Shanchuan Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Shanchuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047419128","display_name":"Qi Zhao","orcid":"https://orcid.org/0000-0003-3054-8934"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Qi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101615046","display_name":"Jianyi Wang","orcid":"https://orcid.org/0000-0001-7025-3626"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jianyi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037727565","display_name":"Liangke Gui","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gui, Liangke","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057424614","display_name":"Matt Fredrikson","orcid":"https://orcid.org/0000-0003-1820-1698"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fredrikson, Matt","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5090730336","display_name":"Lu Jiang","orcid":"https://orcid.org/0000-0003-0286-8439"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Lu","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5108259382"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.989300012588501,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13310","display_name":"Subtitles and Audiovisual Media","score":0.9840999841690063,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.6261643171310425},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.5506649613380432},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.54240882396698},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.43462345004081726},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.389691025018692}],"concepts":[{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.6261643171310425},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.5506649613380432},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.54240882396698},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.43462345004081726},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.389691025018692}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2412.19531","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.19531","pdf_url":"https://arxiv.org/pdf/2412.19531","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2412.19531","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2412.19531","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2412.19531","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.19531","pdf_url":"https://arxiv.org/pdf/2412.19531","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4405903305.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2772917594","https://openalex.org/W2036807459","https://openalex.org/W2058170566","https://openalex.org/W2755342338","https://openalex.org/W2166024367","https://openalex.org/W3116076068","https://openalex.org/W2229312674","https://openalex.org/W2951359407","https://openalex.org/W2079911747","https://openalex.org/W1969923398"],"abstract_inverted_index":{"In":[0,144],"text-to-image":[1],"(T2I)":[2],"generation,":[3],"a":[4,53,151],"prevalent":[5],"training":[6,140],"technique":[7],"involves":[8],"utilizing":[9],"Vision":[10],"Language":[11],"Models":[12],"(VLMs)":[13],"for":[14,93,136],"image":[15],"re-captioning.":[16],"Even":[17],"though":[18],"VLMs":[19,86],"are":[20],"known":[21],"to":[22,146,157],"exhibit":[23],"hallucination,":[24],"generating":[25],"descriptive":[26],"content":[27],"that":[28,72],"deviates":[29],"from":[30],"the":[31,34,74,100,114,123,134,163],"visual":[32],"reality,":[33],"ramifications":[35],"of":[36,116,126,165],"such":[37],"caption":[38,64,77,108,127,159],"hallucinations":[39],"on":[40,113,129],"T2I":[41,166],"generation":[42,67],"performance":[43,131],"remain":[44],"under-explored.":[45],"Through":[46],"our":[47],"empirical":[48],"investigation,":[49],"we":[50,149],"first":[51],"establish":[52],"comprehensive":[54],"dataset":[55],"comprising":[56],"VLM-generated":[57],"captions,":[58],"and":[59,95,132],"then":[60],"systematically":[61],"analyze":[62],"how":[63],"hallucination":[65,169],"influences":[66],"outcomes.":[68],"Our":[69],"findings":[70,120],"reveal":[71],"(1)":[73],"disparities":[75],"in":[76,99,107,142,170],"quality":[78,115,128],"persistently":[79],"impact":[80,125],"model":[81,130],"outputs":[82],"during":[83],"fine-tuning.":[84],"(2)":[85],"confidence":[87,155],"scores":[88],"serve":[89],"as":[90],"reliable":[91],"indicators":[92],"detecting":[94],"characterizing":[96],"noise-related":[97],"patterns":[98],"data":[101],"distribution.":[102],"(3)":[103],"even":[104],"subtle":[105],"variations":[106],"fidelity":[109],"have":[110],"significant":[111],"effects":[112],"learned":[117],"representations.":[118],"These":[119],"collectively":[121],"emphasize":[122],"profound":[124],"highlight":[133],"need":[135],"more":[137],"sophisticated":[138],"robust":[139],"algorithm":[141],"T2I.":[143],"response":[145],"these":[147],"observations,":[148],"propose":[150],"approach":[152],"leveraging":[153],"VLM":[154],"score":[156],"mitigate":[158],"noise,":[160],"thereby":[161],"enhancing":[162],"robustness":[164],"models":[167],"against":[168],"caption.":[171]},"counts_by_year":[],"updated_date":"2026-03-12T08:34:05.389933","created_date":"2025-10-10T00:00:00"}
