{"id":"https://openalex.org/W7154723878","doi":"https://doi.org/10.48550/arxiv.2604.15210","title":"Learning to Think Like a Cartoon Captionist: Incongruity-Resolution Supervision for Multimodal Humor Understanding","display_name":"Learning to Think Like a Cartoon Captionist: Incongruity-Resolution Supervision for Multimodal Humor Understanding","publication_year":2026,"publication_date":"2026-04-16","ids":{"openalex":"https://openalex.org/W7154723878","doi":"https://doi.org/10.48550/arxiv.2604.15210"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.15210","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.15210","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.15210","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133836892","display_name":"Hatice Merve Vural","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Vural, Hatice Merve","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133838071","display_name":"Doga Kukul","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kukul, Doga","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133857979","display_name":"Ege Erdem Ozlu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ozlu, Ege Erdem","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133842198","display_name":"Demir Ekin Arikan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Arikan, Demir Ekin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045277749","display_name":"Bob Mankoff","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mankoff, Bob","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133833148","display_name":"Erkut Erdem","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Erdem, Erkut","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133866556","display_name":"Aykut Erdem","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Erdem, Aykut","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5133836892"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11795","display_name":"Humor Studies and Applications","score":0.945900022983551,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11795","display_name":"Humor Studies and Applications","score":0.945900022983551,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11148","display_name":"Language, Metaphor, and Cognition","score":0.011099999770522118,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.009600000455975533,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.5460000038146973},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.5425000190734863},{"id":"https://openalex.org/keywords/contest","display_name":"CONTEST","score":0.5022000074386597},{"id":"https://openalex.org/keywords/interpretation","display_name":"Interpretation (philosophy)","score":0.4657000005245209},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.4300999939441681},{"id":"https://openalex.org/keywords/ranking","display_name":"Ranking (information retrieval)","score":0.40380001068115234},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.39899998903274536},{"id":"https://openalex.org/keywords/spatial-intelligence","display_name":"Spatial intelligence","score":0.3711000084877014},{"id":"https://openalex.org/keywords/cognition","display_name":"Cognition","score":0.3612000048160553}],"concepts":[{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.5460000038146973},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.5425000190734863},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.5249000191688538},{"id":"https://openalex.org/C2777582232","wikidata":"https://www.wikidata.org/wiki/Q5013414","display_name":"CONTEST","level":2,"score":0.5022000074386597},{"id":"https://openalex.org/C527412718","wikidata":"https://www.wikidata.org/wiki/Q855395","display_name":"Interpretation (philosophy)","level":2,"score":0.4657000005245209},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.46470001339912415},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.4366999864578247},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.4300999939441681},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.42010000348091125},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.40380001068115234},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.39899998903274536},{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.3711000084877014},{"id":"https://openalex.org/C169900460","wikidata":"https://www.wikidata.org/wiki/Q2200417","display_name":"Cognition","level":2,"score":0.3612000048160553},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3544999957084656},{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.3531999886035919},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.34439998865127563},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.336899995803833},{"id":"https://openalex.org/C37228920","wikidata":"https://www.wikidata.org/wiki/Q1307600","display_name":"Experiential learning","level":2,"score":0.31630000472068787},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.28870001435279846},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.28839999437332153},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.2840000092983246},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.27469998598098755},{"id":"https://openalex.org/C193221554","wikidata":"https://www.wikidata.org/wiki/Q5153664","display_name":"Commonsense reasoning","level":2,"score":0.27149999141693115},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.2621000111103058},{"id":"https://openalex.org/C103057564","wikidata":"https://www.wikidata.org/wiki/Q4751139","display_name":"Analytic reasoning","level":3,"score":0.2619999945163727},{"id":"https://openalex.org/C2780297707","wikidata":"https://www.wikidata.org/wiki/Q4895393","display_name":"Landmark","level":2,"score":0.26179999113082886},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.25999999046325684},{"id":"https://openalex.org/C56461940","wikidata":"https://www.wikidata.org/wiki/Q970687","display_name":"Eye tracking","level":2,"score":0.2547999918460846},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.25360000133514404},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2533999979496002}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.15210","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.15210","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.15210","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.15210","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.45940184593200684,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Humor":[0],"is":[1,178],"one":[2],"of":[3,82],"the":[4,10,18,31,46,73,113],"few":[5],"cognitive":[6],"tasks":[7],"where":[8],"getting":[9,17],"reasoning":[11,48,106,165,172],"right":[12],"matters":[13],"as":[14,16,30,42],"much":[15],"answer":[19],"right.":[20],"While":[21],"recent":[22],"work":[23],"evaluates":[24,89],"humor":[25,51,62],"understanding":[26,63],"on":[27,130,153],"benchmarks":[28,159],"such":[29],"New":[32],"Yorker":[33],"Cartoon":[34],"Caption":[35],"Contest":[36],"(NYCC),":[37],"it":[38,41],"largely":[39],"treats":[40],"black-box":[43],"prediction,":[44],"overlooking":[45],"structured":[47,109],"processes":[49],"underlying":[50],"comprehension.":[52],"We":[53],"introduce":[54],"IRS":[55,103,132,162],"(Incongruity-Resolution":[56],"Supervision),":[57],"a":[58],"framework":[59],"that":[60,111,161,170],"decomposes":[61],"into":[64],"three":[65],"components:":[66],"incongruity":[67],"modeling,":[68,77],"which":[69,78,88],"identifies":[70],"mismatches":[71],"in":[72,96],"visual":[74,116],"scene;":[75],"resolution":[76],"constructs":[79],"coherent":[80],"reinterpretations":[81],"these":[83],"mismatches;":[84],"and":[85,99,122,127,136,143],"preference":[86],"alignment,":[87],"candidate":[90],"interpretations":[91],"under":[92],"human":[93],"judgments.":[94],"Grounded":[95],"incongruity-resolution":[97],"theory":[98],"expert":[100],"captionist":[101],"practice,":[102],"supervises":[104],"intermediate":[105],"process":[107],"through":[108],"traces":[110],"make":[112],"path":[114],"from":[115],"perception":[117],"to":[118,157],"humorous":[119],"interpretation":[120],"explicit":[121],"learnable.":[123],"Across":[124],"7B,":[125],"32B,":[126],"72B":[128],"models":[129],"NYCC,":[131],"outperforms":[133],"strong":[134],"open":[135],"closed":[137],"multimodal":[138],"baselines":[139],"across":[140],"caption":[141],"matching":[142],"ranking":[144],"tasks,":[145],"with":[146],"our":[147],"largest":[148],"model":[149],"approaching":[150],"expert-level":[151],"performance":[152],"ranking.":[154],"Zero-shot":[155],"transfer":[156],"external":[158],"shows":[160],"learns":[163],"generalizable":[164],"patterns.":[166],"Our":[167],"results":[168],"suggest":[169],"supervising":[171],"structure,":[173],"rather":[174],"than":[175],"scale":[176],"alone,":[177],"key":[179],"for":[180],"reasoning-centric":[181],"tasks.":[182]},"counts_by_year":[],"updated_date":"2026-04-18T06:05:20.339008","created_date":"2026-04-18T00:00:00"}
