{"id":"https://openalex.org/W4414360189","doi":"https://doi.org/10.24963/ijcai.2025/1259","title":"A Multimodal AI Dialogue System for Unified Document, Visual, and Audio Interaction","display_name":"A Multimodal AI Dialogue System for Unified Document, Visual, and Audio Interaction","publication_year":2025,"publication_date":"2025-09-01","ids":{"openalex":"https://openalex.org/W4414360189","doi":"https://doi.org/10.24963/ijcai.2025/1259"},"language":"en","primary_location":{"id":"doi:10.24963/ijcai.2025/1259","is_oa":false,"landing_page_url":"https://doi.org/10.24963/ijcai.2025/1259","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Thirty-Fourth International Joint Conference on Artificial Intelligence","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5024885017","display_name":"Yujun Feng","orcid":"https://orcid.org/0000-0001-5766-472X"},"institutions":[{"id":"https://openalex.org/I83328450","display_name":"Miami University","ror":"https://ror.org/05nbqxr67","country_code":"US","type":"education","lineage":["https://openalex.org/I83328450"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yujun Feng","raw_affiliation_strings":["Miami University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Miami University","institution_ids":["https://openalex.org/I83328450"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077703699","display_name":"Jingyi Huang","orcid":"https://orcid.org/0000-0001-5590-8954"},"institutions":[{"id":"https://openalex.org/I83328450","display_name":"Miami University","ror":"https://ror.org/05nbqxr67","country_code":"US","type":"education","lineage":["https://openalex.org/I83328450"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jingyi Huang","raw_affiliation_strings":["Miami University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Miami University","institution_ids":["https://openalex.org/I83328450"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5103072946","display_name":"Yang Zhang","orcid":"https://orcid.org/0009-0006-1122-1898"},"institutions":[{"id":"https://openalex.org/I83328450","display_name":"Miami University","ror":"https://ror.org/05nbqxr67","country_code":"US","type":"education","lineage":["https://openalex.org/I83328450"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yang Zhang","raw_affiliation_strings":["Miami University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Miami University","institution_ids":["https://openalex.org/I83328450"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.11881317,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"11044","last_page":"11047"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9851999878883362,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9851999878883362,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/multimodal-interaction","display_name":"Multimodal interaction","score":0.6384999752044678},{"id":"https://openalex.org/keywords/conversation","display_name":"Conversation","score":0.602400004863739},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.5425000190734863},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5275999903678894},{"id":"https://openalex.org/keywords/user-interface","display_name":"User interface","score":0.4740000069141388},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4595000147819519},{"id":"https://openalex.org/keywords/interface","display_name":"Interface (matter)","score":0.45739999413490295},{"id":"https://openalex.org/keywords/identity","display_name":"Identity (music)","score":0.39899998903274536}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8252999782562256},{"id":"https://openalex.org/C135641252","wikidata":"https://www.wikidata.org/wiki/Q738567","display_name":"Multimodal interaction","level":2,"score":0.6384999752044678},{"id":"https://openalex.org/C2777200299","wikidata":"https://www.wikidata.org/wiki/Q52943","display_name":"Conversation","level":2,"score":0.602400004863739},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.574400007724762},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.5425000190734863},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5275999903678894},{"id":"https://openalex.org/C89505385","wikidata":"https://www.wikidata.org/wiki/Q47146","display_name":"User interface","level":2,"score":0.4740000069141388},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4595000147819519},{"id":"https://openalex.org/C113843644","wikidata":"https://www.wikidata.org/wiki/Q901882","display_name":"Interface (matter)","level":4,"score":0.45739999413490295},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.39899998903274536},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.3659000098705292},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.3528999984264374},{"id":"https://openalex.org/C190954187","wikidata":"https://www.wikidata.org/wiki/Q5270587","display_name":"Dialog system","level":3,"score":0.3434000015258789},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.3190000057220459},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.30559998750686646},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.29840001463890076},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.29120001196861267},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.289900004863739},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.28139999508857727},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2770000100135803},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.2685000002384186},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.2644999921321869},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.26089999079704285},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.258899986743927}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.24963/ijcai.2025/1259","is_oa":false,"landing_page_url":"https://doi.org/10.24963/ijcai.2025/1259","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Thirty-Fourth International Joint Conference on Artificial Intelligence","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"This":[0,130],"paper":[1],"presents":[2],"a":[3,20,66],"multimodal":[4,67,137],"intelligent":[5],"dialogue":[6],"system":[7,25,86],"that":[8],"seamlessly":[9],"integrates":[10],"document":[11,37,60,115],"analysis,":[12,38],"visual":[13],"media":[14],"processing,":[15],"and":[16,42,48,73,75,92,101,119,153],"audio":[17,91,99,120],"interaction":[18],"within":[19],"unified":[21],"web":[22],"interface.":[23],"The":[24,85],"ensures":[26],"secure":[27],"user":[28,151],"identity":[29],"verification":[30],"through":[31,62],"persistent":[32,77],"conversational":[33,83],"management,":[34],"leveraging":[35],"textual":[36],"dynamic":[39],"context":[40,125],"integration,":[41],"cross-media":[43],"interactions":[44,96],"via":[45],"video,":[46],"image,":[47],"real-time":[49],"speech":[50],"processing.":[51],"Our":[52],"approach":[53],"introduces":[54],"three":[55],"key":[56],"innovations:":[57],"(1)":[58],"context-aware":[59],"analysis":[61],"text":[63,103],"extraction,":[64],"(2)":[65],"input":[68,100,138],"pipeline":[69],"supporting":[70],"images,":[71],"videos,":[72],"audio,":[74],"(3)":[76],"chat":[78],"history":[79],"management":[80],"for":[81,114,149],"maintaining":[82],"continuity.":[84],"facilitates":[87],"seamless":[88],"transitions":[89],"between":[90],"text,":[93],"enabling":[94],"natural":[95],"by":[97],"processing":[98],"converting":[102],"responses":[104],"into":[105],"speech.":[106],"Additionally,":[107],"the":[108,133],"platform":[109],"provides":[110],"an":[111,140],"intuitive":[112],"interface":[113],"uploads,":[116],"camera":[117],"capture,":[118],"recording,":[121],"while":[122],"ensuring":[123],"conversation":[124],"is":[126],"preserved":[127],"across":[128],"sessions.":[129],"implementation":[131],"demonstrates":[132],"practical":[134],"integration":[135],"of":[136],"in":[139],"interactive":[141],"artificial":[142],"intelligence":[143],"(AI)":[144],"system,":[145],"showcasing":[146],"its":[147],"potential":[148],"enhanced":[150],"engagement":[152],"interaction.":[154]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
