{"id":"https://openalex.org/W4414360623","doi":"https://doi.org/10.24963/ijcai.2025/232","title":"TP-Eval: Tap Multimodal LLMs' Potential in Evaluation by Customizing Prompts","display_name":"TP-Eval: Tap Multimodal LLMs' Potential in Evaluation by Customizing Prompts","publication_year":2025,"publication_date":"2025-09-01","ids":{"openalex":"https://openalex.org/W4414360623","doi":"https://doi.org/10.24963/ijcai.2025/232"},"language":"en","primary_location":{"id":"doi:10.24963/ijcai.2025/232","is_oa":false,"landing_page_url":"https://doi.org/10.24963/ijcai.2025/232","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Thirty-Fourth International Joint Conference on Artificial Intelligence","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5114858444","display_name":"Yuxuan Xie","orcid":null},"institutions":[{"id":"https://openalex.org/I4210100255","display_name":"Beijing Academy of Artificial Intelligence","ror":"https://ror.org/016a74861","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210100255"]},{"id":"https://openalex.org/I4391012619","display_name":"Shanghai Artificial Intelligence Laboratory","ror":"https://ror.org/03wkvpx79","country_code":null,"type":"facility","lineage":["https://openalex.org/I4391012619"]},{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yuxuan Xie","raw_affiliation_strings":["School of Computer Science, Shanghai Jiao Tong","Shanghai Artificial Intelligence Laboratory","Zhiyuan College, Shanghai Jiao Tong University"],"affiliations":[{"raw_affiliation_string":"School of Computer Science, Shanghai Jiao Tong","institution_ids":["https://openalex.org/I183067930"]},{"raw_affiliation_string":"Shanghai Artificial Intelligence Laboratory","institution_ids":["https://openalex.org/I4210100255","https://openalex.org/I4391012619"]},{"raw_affiliation_string":"Zhiyuan College, Shanghai Jiao Tong University","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101428262","display_name":"Tianhua Li","orcid":"https://orcid.org/0000-0003-3906-4602"},"institutions":[{"id":"https://openalex.org/I4210100255","display_name":"Beijing Academy of Artificial Intelligence","ror":"https://ror.org/016a74861","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210100255"]},{"id":"https://openalex.org/I4391012619","display_name":"Shanghai Artificial Intelligence Laboratory","ror":"https://ror.org/03wkvpx79","country_code":null,"type":"facility","lineage":["https://openalex.org/I4391012619"]},{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tianhua Li","raw_affiliation_strings":["School of Computer Science, Shanghai Jiao Tong","Shanghai Artificial Intelligence Laboratory"],"affiliations":[{"raw_affiliation_string":"School of Computer Science, Shanghai Jiao Tong","institution_ids":["https://openalex.org/I183067930"]},{"raw_affiliation_string":"Shanghai Artificial Intelligence Laboratory","institution_ids":["https://openalex.org/I4210100255","https://openalex.org/I4391012619"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101827257","display_name":"Wenqi Shao","orcid":"https://orcid.org/0000-0003-3781-4086"},"institutions":[{"id":"https://openalex.org/I4210100255","display_name":"Beijing Academy of Artificial Intelligence","ror":"https://ror.org/016a74861","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210100255"]},{"id":"https://openalex.org/I4391012619","display_name":"Shanghai Artificial Intelligence Laboratory","ror":"https://ror.org/03wkvpx79","country_code":null,"type":"facility","lineage":["https://openalex.org/I4391012619"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenqi Shao","raw_affiliation_strings":["Shanghai Artificial Intelligence Laboratory"],"affiliations":[{"raw_affiliation_string":"Shanghai Artificial Intelligence Laboratory","institution_ids":["https://openalex.org/I4210100255","https://openalex.org/I4391012619"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5036606244","display_name":"Kaipeng Zhang","orcid":"https://orcid.org/0000-0001-6105-6532"},"institutions":[{"id":"https://openalex.org/I4391012619","display_name":"Shanghai Artificial Intelligence Laboratory","ror":"https://ror.org/03wkvpx79","country_code":null,"type":"facility","lineage":["https://openalex.org/I4391012619"]},{"id":"https://openalex.org/I4210100255","display_name":"Beijing Academy of Artificial Intelligence","ror":"https://ror.org/016a74861","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210100255"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kaipeng Zhang","raw_affiliation_strings":["Shanghai Artificial Intelligence Laboratory"],"affiliations":[{"raw_affiliation_string":"Shanghai Artificial Intelligence Laboratory","institution_ids":["https://openalex.org/I4210100255","https://openalex.org/I4391012619"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5114858444"],"corresponding_institution_ids":["https://openalex.org/I183067930","https://openalex.org/I4210100255","https://openalex.org/I4391012619"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.14011152,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"2081","last_page":"2088"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13083","display_name":"Advanced Text Analysis Techniques","score":0.7892000079154968,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13083","display_name":"Advanced Text Analysis Techniques","score":0.7892000079154968,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/personalization","display_name":"Personalization","score":0.7360000014305115},{"id":"https://openalex.org/keywords/evaluation-methods","display_name":"Evaluation methods","score":0.32330000400543213},{"id":"https://openalex.org/keywords/sensitivity","display_name":"Sensitivity (control systems)","score":0.3125999867916107},{"id":"https://openalex.org/keywords/multimodal-interaction","display_name":"Multimodal interaction","score":0.2793999910354614}],"concepts":[{"id":"https://openalex.org/C183003079","wikidata":"https://www.wikidata.org/wiki/Q1000371","display_name":"Personalization","level":2,"score":0.7360000014305115},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7333999872207642},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.44830000400543213},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.36250001192092896},{"id":"https://openalex.org/C3018395757","wikidata":"https://www.wikidata.org/wiki/Q1379672","display_name":"Evaluation methods","level":2,"score":0.32330000400543213},{"id":"https://openalex.org/C21200559","wikidata":"https://www.wikidata.org/wiki/Q7451068","display_name":"Sensitivity (control systems)","level":2,"score":0.3125999867916107},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.29440000653266907},{"id":"https://openalex.org/C135641252","wikidata":"https://www.wikidata.org/wiki/Q738567","display_name":"Multimodal interaction","level":2,"score":0.2793999910354614},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.27869999408721924},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.2648000121116638},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.2619999945163727}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.24963/ijcai.2025/232","is_oa":false,"landing_page_url":"https://doi.org/10.24963/ijcai.2025/232","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Thirty-Fourth International Joint Conference on Artificial Intelligence","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recently,":[0],"multimodal":[1],"large":[2],"language":[3],"models":[4,63,78],"(MLLMs)":[5],"have":[6,64],"received":[7],"much":[8],"attention":[9],"for":[10,67,76,124,134],"their":[11],"impressive":[12],"capabilities.":[13],"The":[14],"evaluation":[15,81,96,108,169],"of":[16,24,36,141,149],"MLLMs":[17,25],"is":[18],"becoming":[19],"critical":[20],"to":[21,45,106,120,138,152],"analyzing":[22],"attributes":[23],"and":[26,70,91,110,156,166],"providing":[27],"valuable":[28],"insights.":[29],"However,":[30],"current":[31],"benchmarks":[32,90],"overlook":[33],"the":[34,54,58,73,117,139,147,160],"problem":[35],"prompt":[37,41,75,103,135],"sensitivity":[38],"-":[39],"minor":[40],"variations":[42],"may":[43,52],"lead":[44],"significant":[46],"performance":[47],"fluctuations.":[48],"Thus,":[49],"inappropriate":[50],"prompts":[51,119,123],"obscure":[53],"models'":[55,59,112,154],"capabilities,":[56,155],"underestimating":[57],"performance.":[60],"Moreover,":[61],"different":[62,65,68,121,125],"preferences":[66],"prompts,":[69],"thus,":[71],"using":[72],"same":[74],"all":[77],"will":[79,115],"cause":[80],"bias.":[82],"This":[83],"paper":[84],"analyzes":[85],"this":[86],"deficiency":[87],"in":[88,162],"existing":[89],"further":[92],"introduces":[93,101],"a":[94,102],"new":[95],"framework":[97],"named":[98],"TP-Eval,":[99],"which":[100],"customization":[104,136],"method":[105],"reduce":[107],"biases":[109],"tap":[111],"potential.":[113],"TP-Eval":[114,157],"rewrite":[116],"original":[118],"customized":[122],"models.":[126],"In":[127],"particular,":[128],"we":[129],"propose":[130],"some":[131],"well-designed":[132],"modules":[133],"tailored":[137],"scenario":[140],"MLLM":[142,168],"evaluation.":[143],"Extensive":[144],"experiments":[145],"demonstrate":[146],"effectiveness":[148],"our":[150],"approach":[151],"uncovering":[153],"should":[158],"benefit":[159],"community":[161],"developing":[163],"more":[164],"comprehensive":[165],"convincing":[167],"benchmarks.":[170]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-10T00:00:00"}
