{"id":"https://openalex.org/W4416016898","doi":"https://doi.org/10.1145/3746252.3761264","title":"Rethinking the Training Paradigm of Discrete Token-Based Multimodal LLMs: An Analysis of Text-Centric Bias","display_name":"Rethinking the Training Paradigm of Discrete Token-Based Multimodal LLMs: An Analysis of Text-Centric Bias","publication_year":2025,"publication_date":"2025-11-07","ids":{"openalex":"https://openalex.org/W4416016898","doi":"https://doi.org/10.1145/3746252.3761264"},"language":null,"primary_location":{"id":"doi:10.1145/3746252.3761264","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746252.3761264","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 34th ACM International Conference on Information and Knowledge Management","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3746252.3761264","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5120301481","display_name":"Wansik Jo","orcid":"https://orcid.org/0009-0001-8957-0223"},"institutions":[{"id":"https://openalex.org/I119561980","display_name":"Umicore (Belgium)","ror":"https://ror.org/00zwdnt53","country_code":"BE","type":"company","lineage":["https://openalex.org/I119561980"]}],"countries":["BE"],"is_corresponding":true,"raw_author_name":"Wansik Jo","raw_affiliation_strings":["Meritzfire, Seoul, Republic of Korea"],"raw_orcid":"https://orcid.org/0009-0001-8957-0223","affiliations":[{"raw_affiliation_string":"Meritzfire, Seoul, Republic of Korea","institution_ids":["https://openalex.org/I119561980"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120301482","display_name":"Jooyeong Na","orcid":null},"institutions":[{"id":"https://openalex.org/I57664883","display_name":"Ajou University","ror":"https://ror.org/03tzb2h73","country_code":"KR","type":"education","lineage":["https://openalex.org/I57664883"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Jooyeong Na","raw_affiliation_strings":["Ajou University, Suwon, Republic of Korea"],"raw_orcid":"https://orcid.org/0009-0007-7870-8681","affiliations":[{"raw_affiliation_string":"Ajou University, Suwon, Republic of Korea","institution_ids":["https://openalex.org/I57664883"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102902059","display_name":"Soyeon Hong","orcid":"https://orcid.org/0009-0006-8766-3879"},"institutions":[{"id":"https://openalex.org/I57664883","display_name":"Ajou University","ror":"https://ror.org/03tzb2h73","country_code":"KR","type":"education","lineage":["https://openalex.org/I57664883"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Soyeon Hong","raw_affiliation_strings":["Ajou University, Suwon, Republic of Korea"],"raw_orcid":"https://orcid.org/0009-0006-8766-3879","affiliations":[{"raw_affiliation_string":"Ajou University, Suwon, Republic of Korea","institution_ids":["https://openalex.org/I57664883"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103182620","display_name":"Seungtaek Choi","orcid":"https://orcid.org/0000-0003-3570-0907"},"institutions":[{"id":"https://openalex.org/I83436808","display_name":"Hankuk University of Foreign Studies","ror":"https://ror.org/051q2m369","country_code":"KR","type":"education","lineage":["https://openalex.org/I83436808"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Seungtaek Choi","raw_affiliation_strings":["Hankuk University of Foreign Studies, Seoul, Republic of Korea"],"raw_orcid":"https://orcid.org/0000-0003-3570-0907","affiliations":[{"raw_affiliation_string":"Hankuk University of Foreign Studies, Seoul, Republic of Korea","institution_ids":["https://openalex.org/I83436808"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5044711814","display_name":"Hyunsouk Cho","orcid":null},"institutions":[{"id":"https://openalex.org/I57664883","display_name":"Ajou University","ror":"https://ror.org/03tzb2h73","country_code":"KR","type":"education","lineage":["https://openalex.org/I57664883"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Hyunsouk Cho","raw_affiliation_strings":["Ajou University, Suwon, Republic of Korea"],"raw_orcid":"https://orcid.org/0000-0002-9134-1921","affiliations":[{"raw_affiliation_string":"Ajou University, Suwon, Republic of Korea","institution_ids":["https://openalex.org/I57664883"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5120301481"],"corresponding_institution_ids":["https://openalex.org/I119561980"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.16762907,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1249","last_page":"1259"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.5264000296592712,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.5264000296592712,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.094200000166893,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.08299999684095383,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.7218999862670898},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.6460999846458435},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.6003000140190125},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.5113000273704529},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.492900013923645},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.4740000069141388},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.39660000801086426},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.39079999923706055}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7495999932289124},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.7218999862670898},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.6460999846458435},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.6003000140190125},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5742999911308289},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.5113000273704529},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.492900013923645},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.478300005197525},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.4740000069141388},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.39660000801086426},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.39079999923706055},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.38119998574256897},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.3772999942302704},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.37720000743865967},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.3709000051021576},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.3463999927043915},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.3434000015258789},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.32339999079704285},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2930999994277954},{"id":"https://openalex.org/C73000952","wikidata":"https://www.wikidata.org/wiki/Q17007827","display_name":"Discretization","level":2,"score":0.2851000130176544},{"id":"https://openalex.org/C2983448237","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Language understanding","level":2,"score":0.2842000126838684},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.2831999957561493},{"id":"https://openalex.org/C120936955","wikidata":"https://www.wikidata.org/wiki/Q2155640","display_name":"Empirical research","level":2,"score":0.28200000524520874},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.27469998598098755},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.258899986743927}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746252.3761264","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746252.3761264","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 34th ACM International Conference on Information and Knowledge Management","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3746252.3761264","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746252.3761264","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 34th ACM International Conference on Information and Knowledge Management","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":22,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2108598243","https://openalex.org/W2149741699","https://openalex.org/W2560730294","https://openalex.org/W2906152891","https://openalex.org/W3012624518","https://openalex.org/W3015700067","https://openalex.org/W3099884329","https://openalex.org/W3120485916","https://openalex.org/W3133702157","https://openalex.org/W3168771811","https://openalex.org/W3180303049","https://openalex.org/W3180355996","https://openalex.org/W4312261477","https://openalex.org/W4372260310","https://openalex.org/W4386065611","https://openalex.org/W4392909390","https://openalex.org/W4401955911","https://openalex.org/W4402716330","https://openalex.org/W4402772272","https://openalex.org/W4413145813","https://openalex.org/W4415795709"],"related_works":[],"abstract_inverted_index":{"Discrete":[0],"token-based":[1,43,199],"multimodal":[2,182,208],"large":[3],"language":[4,27],"models":[5],"(MLLMs),":[6],"such":[7,32],"as":[8,33,82],"AnyGPT":[9],"and":[10,35,50,110,203,213],"MIO,":[11],"integrate":[12],"diverse":[13],"modalities":[14],"into":[15,23],"an":[16,83,104,128],"autoregressive":[17],"framework":[18,106],"by":[19,162],"discretizing":[20],"modality":[21,47,92],"inputs":[22],"tokens":[24],"compatible":[25],"with":[26,53],"models.":[28],"Unlike":[29],"encoder-based":[30],"approaches,":[31],"LLaVA":[34],"Flamingo,":[36],"which":[37,133],"utilize":[38],"pretrained":[39],"modality-specific":[40],"encoders,":[41],"discrete":[42,198],"MLLMs":[44],"simultaneously":[45],"learn":[46],"token":[48],"representations":[49],"their":[51],"alignment":[52],"the":[54,86,97,118,122,157,163],"language,":[55],"yet":[56],"are":[57,215],"exclusively":[58],"trained":[59],"on":[60,85],"modality-text":[61],"paired":[62],"datasets":[63],"without":[64],"additional":[65],"unimodal":[66,137,147],"training.":[67],"We":[68],"identify":[69],"a":[70,135,193],"structural":[71,177],"limitation":[72,195],"inherent":[73],"in":[74,196],"this":[75,100,176],"training":[76,124,138,165,201,209],"paradigm,":[77,125],"termed":[78],"text-centric":[79,151],"bias,":[80,101,152],"defined":[81],"over-reliance":[84],"textual":[87,188],"context":[88],"that":[89,143,156,175],"restricts":[90],"intrinsic":[91],"understanding.":[93],"To":[94],"systematically":[95],"analyze":[96],"existence":[98],"of":[99],"we":[102,126,172],"propose":[103],"analytical":[105,129],"involving":[107],"external":[108],"perplexity-based":[109],"internal":[111],"neuron-level":[112],"analyses.":[113],"Furthermore,":[114],"to":[115,146],"verify":[116],"whether":[117],"bias":[119,158,178],"originates":[120],"from":[121],"paired-only":[123,164],"introduce":[127],"methodology":[130],"named":[131],"Monotune,":[132],"is":[134,159],"simple":[136],"stage.":[139],"Our":[140,190,211],"analyses":[141],"demonstrate":[142],"minimal":[144],"exposure":[145],"data":[148],"effectively":[149],"mitigates":[150],"providing":[153],"empirical":[154],"evidence":[155],"fundamentally":[160],"induced":[161],"strategy.":[166],"Through":[167],"comprehensive":[168],"downstream":[169],"task":[170,183],"evaluations,":[171],"further":[173],"reveal":[174],"meaningfully":[179],"affects":[180],"real-world":[181],"performance,":[184],"particularly":[185],"under":[186],"limited":[187],"contexts.":[189],"findings":[191],"highlight":[192],"fundamental":[194],"current":[197],"MLLM":[200],"paradigms":[202],"suggest":[204],"directions":[205],"for":[206],"future":[207],"strategies.":[210],"code":[212],"experiments":[214],"available":[216],"at":[217],"https://github.com/41312432/Monotune":[218]},"counts_by_year":[],"updated_date":"2025-11-08T23:25:12.792448","created_date":"2025-11-08T00:00:00"}
