{"id":"https://openalex.org/W7127435059","doi":"https://doi.org/10.48550/arxiv.2602.00381","title":"Modeling Image-Caption Rating from Comparative Judgments","display_name":"Modeling Image-Caption Rating from Comparative Judgments","publication_year":2026,"publication_date":"2026-01-30","ids":{"openalex":"https://openalex.org/W7127435059","doi":"https://doi.org/10.48550/arxiv.2602.00381"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.00381","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124914013","display_name":"Kezia Minni","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Minni, Kezia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124892021","display_name":"Qiang Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Qiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056492391","display_name":"Monoshiz Mahbub Khan","orcid":"https://orcid.org/0009-0004-7557-4751"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Khan, Monoshiz Mahbub","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5101499129","display_name":"Zhe Yu","orcid":"https://orcid.org/0000-0002-1972-635X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Zhe","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5124914013"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9825000166893005,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9825000166893005,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.0032999999821186066,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.002899999963119626,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pairwise-comparison","display_name":"Pairwise comparison","score":0.6988000273704529},{"id":"https://openalex.org/keywords/regression","display_name":"Regression","score":0.5465999841690063},{"id":"https://openalex.org/keywords/rank","display_name":"Rank (graph theory)","score":0.49390000104904175},{"id":"https://openalex.org/keywords/regression-analysis","display_name":"Regression analysis","score":0.4690999984741211},{"id":"https://openalex.org/keywords/ordinal-regression","display_name":"Ordinal regression","score":0.3221000134944916},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.29739999771118164},{"id":"https://openalex.org/keywords/learning-to-rank","display_name":"Learning to rank","score":0.2833999991416931}],"concepts":[{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.7085000276565552},{"id":"https://openalex.org/C184898388","wikidata":"https://www.wikidata.org/wiki/Q1435712","display_name":"Pairwise comparison","level":2,"score":0.6988000273704529},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.647599995136261},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5924999713897705},{"id":"https://openalex.org/C83546350","wikidata":"https://www.wikidata.org/wiki/Q1139051","display_name":"Regression","level":2,"score":0.5465999841690063},{"id":"https://openalex.org/C164226766","wikidata":"https://www.wikidata.org/wiki/Q7293202","display_name":"Rank (graph theory)","level":2,"score":0.49390000104904175},{"id":"https://openalex.org/C152877465","wikidata":"https://www.wikidata.org/wiki/Q208042","display_name":"Regression analysis","level":2,"score":0.4690999984741211},{"id":"https://openalex.org/C110313322","wikidata":"https://www.wikidata.org/wiki/Q7100793","display_name":"Ordinal regression","level":2,"score":0.3221000134944916},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.29739999771118164},{"id":"https://openalex.org/C86037889","wikidata":"https://www.wikidata.org/wiki/Q4330127","display_name":"Learning to rank","level":3,"score":0.2833999991416931},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2815999984741211},{"id":"https://openalex.org/C181204326","wikidata":"https://www.wikidata.org/wiki/Q7239820","display_name":"Preference learning","level":3,"score":0.2775000035762787},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.27639999985694885},{"id":"https://openalex.org/C48921125","wikidata":"https://www.wikidata.org/wiki/Q10861030","display_name":"Linear regression","level":2,"score":0.26809999346733093},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.2662999927997589},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.25999999046325684}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.00381","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.00381","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.00381","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.00381","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Image":[0],"caption":[1],"rating":[2,17],"is":[3,25,34,240],"becoming":[4],"increasingly":[5],"important":[6],"because":[7],"computer-generated":[8],"captions":[9,21],"are":[10],"used":[11],"extensively":[12],"for":[13,37,242],"descriptive":[14],"annotation.":[15],"However,":[16],"the":[18,81,111,115,120,131,138,148,167,181,195,261],"accuracy":[19],"of":[20,66,114,180,199,233],"in":[22,29,80],"describing":[23],"images":[24],"time-consuming":[26],"and":[27,101,109,197,204,216,246],"subjective":[28],"nature.":[30],"In":[31,51,184],"contrast,":[32],"it":[33],"often":[35],"easier":[36],"people":[38],"to":[39,75,118,147,178,193],"compare":[40,194],"(between":[41],"two":[42],"pairs)":[43],"which":[44],"image-caption":[45,78,164],"pair":[46,157,165],"better":[47,159],"matches":[48,160],"each":[49,161],"other.":[50],"this":[52],"study,":[53],"we":[54,98],"propose":[55],"a":[56,85,94,105,172,186],"machine":[57],"learning":[58,112,150,169],"framework":[59],"that":[60,179,210,228],"models":[61],"such":[62,253],"comparative":[63,149,154,168,211,230,254],"judgments":[64,155,212,231,255],"instead":[65,232],"direct":[67,90,200,223,234,265],"ratings.":[68,91,224,266],"The":[69,141,207,249],"model":[70,87,108,117,121,126,133,143,170,250,262],"can":[71,256],"then":[72],"be":[73],"applied":[74,146],"rank":[76],"unseen":[77],"pairs":[79],"same":[82,142],"way":[83],"as":[84,236,258,260],"regression":[86,96,125,182],"trained":[88,251,263],"on":[89,137,153,252,264],"Inspired":[92],"by":[93],"state-of-the-art":[95],"approach,":[97],"extracted":[99],"visual":[100],"text":[102],"features":[103],"using":[104],"pre-trained":[106],"ViLBERT":[107],"tweaked":[110],"parameters":[113],"baseline":[116,132],"improve":[119],"performance.":[122],"This":[123],"new":[124],"(with":[127,134,175],"Kendall's":[128,135,176],"$\u03c4_c=0.812$)":[129],"outperformed":[130],"$\u03c4_c=0.758$)":[136],"VICR":[139],"dataset.":[140],"structure":[144],"was":[145,191],"framework.":[151],"Trained":[152],"(image-caption":[156],"A":[158],"other":[162],"than":[163,222],"B),":[166],"achieved":[171],"performance":[173],"similar":[174],"$\u03c4_c=0.804$)":[177],"model.":[183],"addition,":[185],"small-scale":[187],"human":[188,220],"subject":[189],"study":[190],"conducted":[192],"cost":[196],"quality":[198],"ratings,":[201],"pairwise":[202],"comparisons,":[203],"same-image":[205],"comparisons.":[206],"results":[208,215,226],"showed":[209],"yielded":[213],"faster":[214],"greater":[217,247],"agreement":[218],"among":[219],"annotators":[221],"These":[225],"suggest":[227],"collecting":[229],"ratings":[235],"training":[237],"data":[238],"labels":[239],"promising":[241],"lower":[243],"annotation":[244],"costs":[245],"consistency.":[248],"perform":[257],"well":[259]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-04T00:00:00"}
