{"id":"https://openalex.org/W4415535684","doi":"https://doi.org/10.1145/3746027.3754699","title":"Query-Focused Multimodal Summarization with Gate-Guided Mixture-of-Experts","display_name":"Query-Focused Multimodal Summarization with Gate-Guided Mixture-of-Experts","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415535684","doi":"https://doi.org/10.1145/3746027.3754699"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3754699","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746027.3754699","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3746027.3754699","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Jiajun Han","orcid":"https://orcid.org/0009-0002-8171-906X"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiajun Han","raw_affiliation_strings":["School of Software, BNRist, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0002-8171-906X","affiliations":[{"raw_affiliation_string":"School of Software, BNRist, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102567832","display_name":"Xuran Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xuran Yang","raw_affiliation_strings":["School of Software, BNRist, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0006-5606-9797","affiliations":[{"raw_affiliation_string":"School of Software, BNRist, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100715419","display_name":"Hui Zhang","orcid":"https://orcid.org/0000-0001-6563-9890"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hui Zhang","raw_affiliation_strings":["School of Software, BNRist, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-6563-9890","affiliations":[{"raw_affiliation_string":"School of Software, BNRist, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.9349,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.80216016,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"6471","last_page":"6480"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/automatic-summarization","display_name":"Automatic summarization","score":0.9492999911308289},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.6028000116348267},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.5429999828338623},{"id":"https://openalex.org/keywords/multi-document-summarization","display_name":"Multi-document summarization","score":0.45669999718666077},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.3828999996185303},{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.27570000290870667}],"concepts":[{"id":"https://openalex.org/C170858558","wikidata":"https://www.wikidata.org/wiki/Q1394144","display_name":"Automatic summarization","level":2,"score":0.9492999911308289},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8659999966621399},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.6028000116348267},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5622000098228455},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.5429999828338623},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4959999918937683},{"id":"https://openalex.org/C134714966","wikidata":"https://www.wikidata.org/wiki/Q6934448","display_name":"Multi-document summarization","level":3,"score":0.45669999718666077},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.41429999470710754},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.3828999996185303},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3089999854564667},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.27570000290870667},{"id":"https://openalex.org/C2780660688","wikidata":"https://www.wikidata.org/wiki/Q25052564","display_name":"Multimodal learning","level":2,"score":0.26339998841285706},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.26170000433921814},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.260699987411499}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3754699","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746027.3754699","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3746027.3754699","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746027.3754699","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":22,"referenced_works":["https://openalex.org/W1979354511","https://openalex.org/W2097117768","https://openalex.org/W2194775991","https://openalex.org/W2759570332","https://openalex.org/W2781922022","https://openalex.org/W2798970487","https://openalex.org/W2809290718","https://openalex.org/W2890721473","https://openalex.org/W2902616437","https://openalex.org/W2949749357","https://openalex.org/W2952138241","https://openalex.org/W2963351448","https://openalex.org/W2963919999","https://openalex.org/W2963929190","https://openalex.org/W2967219836","https://openalex.org/W2982672255","https://openalex.org/W2996811227","https://openalex.org/W3016201367","https://openalex.org/W3107128832","https://openalex.org/W3126245673","https://openalex.org/W4312632488","https://openalex.org/W4386065705"],"related_works":[],"abstract_inverted_index":{"The":[0],"goal":[1],"of":[2,22,38,177],"generic":[3,92],"multimodal":[4,59,95,132,166],"summarization":[5,60,153,159,167],"is":[6,29,49],"to":[7,16,46,51,116,127,142],"extract":[8],"the":[9,20,36,40,101,129,175],"most":[10],"important":[11,50],"information":[12],"from":[13],"different":[14],"modalities":[15],"form":[17],"summaries.":[18],"Yet":[19],"importance":[21],"scenes":[23],"and":[24,32,93,124,145,162,164,172],"text":[25,122],"in":[26,100],"a":[27,86,107,150],"video":[28,152,158],"often":[30],"subjective,":[31],"users":[33],"should":[34],"have":[35,61],"option":[37],"customizing":[39],"summary":[41],"by":[42],"using":[43],"natural":[44],"language":[45,65],"specify":[47],"what":[48],"them.":[52],"However,":[53],"existing":[54],"methods":[55],"for":[56,74,89],"fully":[57],"automatic":[58],"not":[62],"exploited":[63],"available":[64],"models,":[66],"which":[67],"can":[68],"serve":[69],"as":[70],"an":[71],"effective":[72],"prior":[73],"saliency.":[75],"To":[76],"address":[77],"this":[78],"issue,":[79],"we":[80,105,136],"introduce":[81],"Query-Focused":[82],"Multimodal":[83],"Summ":[84],"arization(QFSumm),":[85],"single":[87],"framework":[88],"addressing":[90],"both":[91],"query-focused":[94,151],"summarization,":[96],"typically":[97],"approached":[98],"separately":[99],"literature.":[102],"In":[103,134],"addition,":[104,135],"propose":[106,137],"novel":[108,139],"gate-guided":[109],"mixture-of-experts":[110],"that":[111],"uses":[112],"expert":[113,123],"gate":[114],"module":[115],"organize":[117],"three":[118,165],"experts":[119],"(video":[120],"expert,":[121],"shared":[125],"expert)":[126],"model":[128],"correlations":[130],"between":[131],"information.":[133],"two":[138,156],"contrastive":[140],"losses":[141],"represent":[143],"consistency":[144],"diversity.":[146],"Extensive":[147],"experiments":[148],"on":[149,182],"dataset":[154],"(QFVS),":[155],"standard":[157],"datasets":[160,168],"(TVSum":[161],"SumMe)":[163],"(CNN,":[169],"Daily":[170],"Mail":[171],"BLiSS)":[173],"demonstrate":[174],"superiority":[176],"QFSumm,":[178],"achieving":[179],"state-of-the-art":[180],"performances":[181],"all":[183],"datasets.":[184]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-25T00:00:00"}
