{"id":"https://openalex.org/W4410049264","doi":"https://doi.org/10.1145/3722573.3727826","title":"Simulation vs. Hallucination: Assessing Vision-Language Model Question Answering Capabilities in Engineering Simulations","display_name":"Simulation vs. Hallucination: Assessing Vision-Language Model Question Answering Capabilities in Engineering Simulations","publication_year":2025,"publication_date":"2025-05-03","ids":{"openalex":"https://openalex.org/W4410049264","doi":"https://doi.org/10.1145/3722573.3727826"},"language":"en","primary_location":{"id":"doi:10.1145/3722573.3727826","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3722573.3727826","pdf_url":null,"source":null,"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 7th Workshop on Design Automation for CPS and IoT","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3722573.3727826","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5088296272","display_name":"Jessica Ezemba","orcid":null},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Jessica Ezemba","raw_affiliation_strings":["Carnegie Mellon University, Pittsburgh, Pennsylvania, USA"],"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University, Pittsburgh, Pennsylvania, USA","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005893539","display_name":"Christopher McComb","orcid":"https://orcid.org/0000-0002-5024-7701"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Christopher McComb","raw_affiliation_strings":["Carnegie Mellon University, Pittsburgh, Pennsylvania, USA"],"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University, Pittsburgh, Pennsylvania, USA","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5018232256","display_name":"Conrad S. Tucker","orcid":"https://orcid.org/0000-0001-5365-0240"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Conrad Tucker","raw_affiliation_strings":["Carnegie Mellon University, Pittsburgh, Pennsylvania, USA"],"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University, Pittsburgh, Pennsylvania, USA","institution_ids":["https://openalex.org/I74973139"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5088296272"],"corresponding_institution_ids":["https://openalex.org/I74973139"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.14884156,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"9"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10809","display_name":"Occupational Health and Safety Research","score":0.9240000247955322,"subfield":{"id":"https://openalex.org/subfields/3614","display_name":"Radiological and Ultrasound Technology"},"field":{"id":"https://openalex.org/fields/36","display_name":"Health Professions"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},"topics":[{"id":"https://openalex.org/T10809","display_name":"Occupational Health and Safety Research","score":0.9240000247955322,"subfield":{"id":"https://openalex.org/subfields/3614","display_name":"Radiological and Ultrasound Technology"},"field":{"id":"https://openalex.org/fields/36","display_name":"Health Professions"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.910099983215332,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12574","display_name":"Clinical Reasoning and Diagnostic Skills","score":0.906499981880188,"subfield":{"id":"https://openalex.org/subfields/2714","display_name":"Family Practice"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.691403329372406},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5251628756523132},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5189344882965088},{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.5096261501312256},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.454490602016449}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.691403329372406},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5251628756523132},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5189344882965088},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.5096261501312256},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.454490602016449}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3722573.3727826","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3722573.3727826","pdf_url":null,"source":null,"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 7th Workshop on Design Automation for CPS and IoT","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3722573.3727826","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3722573.3727826","pdf_url":null,"source":null,"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 7th Workshop on Design Automation for CPS and IoT","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":11,"referenced_works":["https://openalex.org/W3159959439","https://openalex.org/W4285999563","https://openalex.org/W4386076314","https://openalex.org/W4388205333","https://openalex.org/W4388223983","https://openalex.org/W4389450167","https://openalex.org/W4395109783","https://openalex.org/W4399758723","https://openalex.org/W4401362547","https://openalex.org/W4403791333","https://openalex.org/W4405064339"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2384605597","https://openalex.org/W2387743295","https://openalex.org/W4288267738","https://openalex.org/W3204019825","https://openalex.org/W2964413124","https://openalex.org/W4388937922","https://openalex.org/W3113264705"],"abstract_inverted_index":{"Engineering":[0],"simulations":[1,18],"generate":[2],"complex":[3],"multimodal":[4,37,108],"data":[5,46,81],"that":[6,129,139,202],"are":[7],"crucial":[8],"for":[9,43,103,168,188,204],"design":[10],"iteration":[11],"and":[12,24,77,87,126,162],"validation.":[13],"The":[14],"interpretation":[15,117,216],"of":[16,51,68,118,217],"these":[17,52],"traditionally":[19],"requires":[20],"significant":[21,208],"domain":[22,124],"expertise":[23],"cognitive":[25],"effort.":[26],"Recently,":[27],"vision-language":[28],"models":[29,53],"(VLMs)":[30],"have":[31],"demonstrated":[32],"impressive":[33],"capabilities":[34,167],"in":[35,54,210],"general-domain":[36],"reasoning":[38,128],"tasks,":[39],"offering":[40],"the":[41,49,115,165,215],"potential":[42],"automating":[44],"simulation":[45,80,120],"interpretation.":[47],"However,":[48,196],"effectiveness":[50],"specialized":[55,123],"engineering":[56,104,119,205,218],"contexts":[57],"remains":[58],"largely":[59],"unexplored.":[60],"This":[61],"paper":[62],"presents":[63],"an":[64],"initial":[65],"comparative":[66],"evaluation":[67,111],"state-of-the-art":[69],"VLMs":[70,213],"on":[71,114,172,176],"question":[72],"answering":[73],"tasks":[74,179],"involving":[75],"structural":[76,177],"fluid":[78,181],"dynamics":[79,182],"across":[82],"three":[83],"modalities:":[84],"text,":[85],"images,":[86],"videos.":[88],"In":[89],"doing":[90],"so,":[91],"we":[92],"introduce":[93],"a":[94],"domain-specific":[95],"benchmark":[96],"dataset":[97],"comprising":[98],"true/false":[99],"questions":[100],"testing":[101],"comprehension":[102],"simulations.":[105,219],"Unlike":[106],"general-purpose":[107],"benchmarks,":[109],"our":[110],"focuses":[112],"specifically":[113],"technical":[116],"outputs,":[121],"requiring":[122],"knowledge":[125],"physical":[127],"is":[130,198],"absent":[131],"from":[132],"broader":[133],"multimedia":[134],"assessments.":[135],"Our":[136],"results":[137],"demonstrate":[138],"text":[140,169],"modality":[141],"yields":[142],"substantially":[143],"higher":[144],"performance":[145],"(up":[146],"to":[147,214],"69.2%":[148],"accuracy":[149],"with":[150,153,159,184],"GPT-4o,":[151,160],"66.3%":[152],"LLaVA)":[154],"than":[155,180],"visual":[156],"inputs":[157],"(52.9-55.8%),":[158],"LLaVA,":[161],"Phi-3":[163],"exhibiting":[164],"strongest":[166],"comprehension.":[170],"Models":[171],"average":[173],"performed":[174],"better":[175],"analysis":[178],"problems,":[183],"minimal":[185],"advantage":[186],"observed":[187],"native":[189],"video":[190],"processing":[191],"over":[192],"batched":[193],"image":[194],"approaches.":[195],"reliability":[197],"still":[199],"far":[200],"below":[201],"needed":[203],"applications,":[206],"highlighting":[207],"challenges":[209],"applying":[211],"current":[212]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
