{"id":"https://openalex.org/W4380032287","doi":"https://doi.org/10.1109/taslp.2023.3284511","title":"DialogMCF: Multimodal Context Flow for Audio Visual Scene-Aware Dialog","display_name":"DialogMCF: Multimodal Context Flow for Audio Visual Scene-Aware Dialog","publication_year":2023,"publication_date":"2023-06-09","ids":{"openalex":"https://openalex.org/W4380032287","doi":"https://doi.org/10.1109/taslp.2023.3284511"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2023.3284511","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2023.3284511","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100457773","display_name":"Zhe Chen","orcid":"https://orcid.org/0000-0003-4599-1170"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]},{"id":"https://openalex.org/I4210099312","display_name":"Shandong Jiaotong University","ror":"https://ror.org/01848hk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210099312"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhe Chen","raw_affiliation_strings":["Cooperative Medianet Innovation Center, Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Cooperative Medianet Innovation Center, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I4210099312","https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100778701","display_name":"Hongcheng Liu","orcid":"https://orcid.org/0009-0000-3271-9544"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]},{"id":"https://openalex.org/I4210099312","display_name":"Shandong Jiaotong University","ror":"https://ror.org/01848hk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210099312"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hongcheng Liu","raw_affiliation_strings":["Cooperative Medianet Innovation Center, Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Cooperative Medianet Innovation Center, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I4210099312","https://openalex.org/I183067930"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100445125","display_name":"Yu Wang","orcid":"https://orcid.org/0000-0001-9500-081X"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]},{"id":"https://openalex.org/I4210099312","display_name":"Shandong Jiaotong University","ror":"https://ror.org/01848hk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210099312"]},{"id":"https://openalex.org/I4210100255","display_name":"Beijing Academy of Artificial Intelligence","ror":"https://ror.org/016a74861","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210100255"]},{"id":"https://openalex.org/I4391012619","display_name":"Shanghai Artificial Intelligence Laboratory","ror":"https://ror.org/03wkvpx79","country_code":null,"type":"facility","lineage":["https://openalex.org/I4391012619"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yu Wang","raw_affiliation_strings":["Cooperative Medianet Innovation Center, Shanghai Jiao Tong University, Shanghai, China","Shanghai Artificial Intelligence Laboratory, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Cooperative Medianet Innovation Center, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I4210099312","https://openalex.org/I183067930"]},{"raw_affiliation_string":"Shanghai Artificial Intelligence Laboratory, Shanghai, China","institution_ids":["https://openalex.org/I4210100255","https://openalex.org/I4391012619"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5100457773"],"corresponding_institution_ids":["https://openalex.org/I183067930","https://openalex.org/I4210099312"],"apc_list":null,"apc_paid":null,"fwci":0.9629,"has_fulltext":false,"cited_by_count":8,"citation_normalized_percentile":{"value":0.77089654,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":97,"max":98},"biblio":{"volume":"32","issue":null,"first_page":"753","last_page":"764"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9962999820709229,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.98580002784729,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8395849466323853},{"id":"https://openalex.org/keywords/dialog-box","display_name":"Dialog box","score":0.7793034315109253},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.6434402465820312},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.6364759802818298},{"id":"https://openalex.org/keywords/multimodal-interaction","display_name":"Multimodal interaction","score":0.585715651512146},{"id":"https://openalex.org/keywords/multimodality","display_name":"Multimodality","score":0.5361810326576233},{"id":"https://openalex.org/keywords/information-flow","display_name":"Information flow","score":0.533110499382019},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4691314399242401},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.43964749574661255},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.42155322432518005},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.4072965681552887},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.39543306827545166},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.3436615467071533},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.13192999362945557},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.10613930225372314}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8395849466323853},{"id":"https://openalex.org/C173853756","wikidata":"https://www.wikidata.org/wiki/Q86915","display_name":"Dialog box","level":2,"score":0.7793034315109253},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.6434402465820312},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.6364759802818298},{"id":"https://openalex.org/C135641252","wikidata":"https://www.wikidata.org/wiki/Q738567","display_name":"Multimodal interaction","level":2,"score":0.585715651512146},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.5361810326576233},{"id":"https://openalex.org/C2779136372","wikidata":"https://www.wikidata.org/wiki/Q10283002","display_name":"Information flow","level":2,"score":0.533110499382019},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4691314399242401},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.43964749574661255},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.42155322432518005},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.4072965681552887},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.39543306827545166},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.3436615467071533},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.13192999362945557},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.10613930225372314},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/taslp.2023.3284511","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2023.3284511","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/17","score":0.44999998807907104,"display_name":"Partnerships for the goals"}],"awards":[{"id":"https://openalex.org/G4532141107","display_name":null,"funder_award_id":"21511101100","funder_id":"https://openalex.org/F4320321885","funder_display_name":"Science and Technology Commission of Shanghai Municipality"},{"id":"https://openalex.org/G5801087733","display_name":null,"funder_award_id":"STCSM 22DZ2229005","funder_id":"https://openalex.org/F4320327718","funder_display_name":"Shanghai Key Laboratory of Digital Media Processing and Transmission"}],"funders":[{"id":"https://openalex.org/F4320321885","display_name":"Science and Technology Commission of Shanghai Municipality","ror":"https://ror.org/03kt66j61"},{"id":"https://openalex.org/F4320327718","display_name":"Shanghai Key Laboratory of Digital Media Processing and Transmission","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":62,"referenced_works":["https://openalex.org/W1956340063","https://openalex.org/W2064675550","https://openalex.org/W2101105183","https://openalex.org/W2108598243","https://openalex.org/W2194775991","https://openalex.org/W2337252826","https://openalex.org/W2526050071","https://openalex.org/W2603266952","https://openalex.org/W2612690371","https://openalex.org/W2810643877","https://openalex.org/W2892245540","https://openalex.org/W2930283066","https://openalex.org/W2950697717","https://openalex.org/W2952592807","https://openalex.org/W2962883855","https://openalex.org/W2963524571","https://openalex.org/W2963789888","https://openalex.org/W2964213933","https://openalex.org/W2964487155","https://openalex.org/W2972745026","https://openalex.org/W2979826702","https://openalex.org/W2988937804","https://openalex.org/W3000366420","https://openalex.org/W3000477803","https://openalex.org/W3015371781","https://openalex.org/W3017272398","https://openalex.org/W3034291519","https://openalex.org/W3102037269","https://openalex.org/W3109558947","https://openalex.org/W3134294468","https://openalex.org/W3137384391","https://openalex.org/W3137797626","https://openalex.org/W3162479984","https://openalex.org/W3176362845","https://openalex.org/W3177219653","https://openalex.org/W3192484213","https://openalex.org/W3205782068","https://openalex.org/W3207072950","https://openalex.org/W4221141684","https://openalex.org/W4229019825","https://openalex.org/W4249013746","https://openalex.org/W4284898017","https://openalex.org/W4287887552","https://openalex.org/W4394659899","https://openalex.org/W6638318767","https://openalex.org/W6638523607","https://openalex.org/W6678262379","https://openalex.org/W6679844565","https://openalex.org/W6682631176","https://openalex.org/W6732742072","https://openalex.org/W6739901393","https://openalex.org/W6757670774","https://openalex.org/W6757817989","https://openalex.org/W6757864324","https://openalex.org/W6766893790","https://openalex.org/W6773596017","https://openalex.org/W6775943838","https://openalex.org/W6784333009","https://openalex.org/W6791353385","https://openalex.org/W6802620464","https://openalex.org/W6810164656","https://openalex.org/W6864544085"],"related_works":["https://openalex.org/W2013013717","https://openalex.org/W66256988","https://openalex.org/W2068964259","https://openalex.org/W1987340722","https://openalex.org/W2791907740","https://openalex.org/W2739856905","https://openalex.org/W4239305747","https://openalex.org/W199123384","https://openalex.org/W4205486898","https://openalex.org/W2978844158"],"abstract_inverted_index":{"In":[0],"recent":[1,60],"years,":[2],"Audio":[3],"Visual":[4],"Scene-Aware":[5],"Dialog":[6,28],"(AVSD)":[7],"has":[8,20],"been":[9,22],"an":[10,36,153,161],"active":[11],"research":[12],"task":[13,34],"in":[14,62,72,106],"the":[15,27,63,80,96,139,142,178,187,190,199,208,221,235],"multimodal":[16,52,81,119,157,172],"dialogue":[17,57,149,173,227],"community":[18],"and":[19,89,175,197],"also":[21],"a":[23,75,111,118,216],"core":[24],"part":[25],"of":[26,38,84,87,98,141,148,156,189,201,218],"System":[29],"Technology":[30],"Challenge":[31],"(DSTC).":[32],"This":[33,132],"is":[35],"extension":[37],"conventional":[39],"visual":[40],"question":[41],"answering,":[42],"where":[43],"video-relevant":[44],"answers":[45],"must":[46],"be":[47],"generated":[48],"taking":[49],"into":[50,95],"account":[51],"contextual":[53,82],"information":[54,83,94,144],"from":[55],"previous":[56],"rounds.":[58],"Despite":[59],"advances":[61],"AVSD":[64,210],"task,":[65],"there":[66],"are":[67,127],"still":[68],"two":[69,104],"major":[70],"challenges":[71],"developing":[73],"such":[74],"system:":[76],"how":[77,90],"to":[78,91,123,129,169,185,215],"model":[79,122,170,193],"multiple":[85,146],"rounds":[86,147],"dialogues":[88],"integrate":[92],"audio-visual":[93,162],"generation":[97,228],"textual":[99],"responses.":[100],"To":[101,151],"tackle":[102],"these":[103],"challenges,":[105],"this":[107],"paper":[108],"we":[109,159,182],"propose":[110,160],"novel":[112],"model,":[113],"named":[114],"DialogMCF,":[115],"which":[116],"constructs":[117],"context":[120,134],"flow":[121,135,179],"generate":[124],"responses":[125],"that":[126],"relevant":[128],"video":[130,236],"scenes.":[131],"proposed":[133,191,222],"modeling":[136],"can":[137,224],"track":[138],"dynamics":[140],"topic":[143],"across":[145],"history.":[150],"achieve":[152],"effective":[154],"fusion":[155],"information,":[158],"memory":[163],"network":[164],"with":[165,194],"cross-modality":[166],"aligned":[167],"features":[168],"long":[171],"context,":[174],"thus":[176],"enhance":[177],"modeling.":[180],"Furthermore,":[181],"make":[183],"attempts":[184],"improve":[186],"performance":[188,229],"DialogMCF":[192],"manual":[195],"descriptions":[196],"explore":[198],"incorporation":[200],"temporal":[202],"reasoning":[203],"information.":[204],"Extensive":[205],"experiments":[206],"on":[207,230],"DSTC":[209],"datasets":[211],"show":[212],"that,":[213],"compared":[214],"range":[217],"baseline":[219],"methods,":[220],"method":[223],"yield":[225],"state-of-art":[226],"most":[231],"metrics":[232],"when":[233],"integrating":[234],"descriptions.":[237]},"counts_by_year":[{"year":2025,"cited_by_count":4},{"year":2024,"cited_by_count":4}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
