{"id":"https://openalex.org/W3207721319","doi":"https://doi.org/10.1145/3474085.3479202","title":"A Multimodal Framework for Video Ads Understanding","display_name":"A Multimodal Framework for Video Ads Understanding","publication_year":2021,"publication_date":"2021-10-17","ids":{"openalex":"https://openalex.org/W3207721319","doi":"https://doi.org/10.1145/3474085.3479202","mag":"3207721319"},"language":"en","primary_location":{"id":"doi:10.1145/3474085.3479202","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3474085.3479202","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 29th ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5075142476","display_name":"Zejia Weng","orcid":"https://orcid.org/0000-0001-9706-6484"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zejia Weng","raw_affiliation_strings":["Fudan University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Fudan University, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045285009","display_name":"Lingchen Meng","orcid":null},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lingchen Meng","raw_affiliation_strings":["Fudan University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Fudan University, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100431257","display_name":"Rui Wang","orcid":"https://orcid.org/0009-0003-8935-3119"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Rui Wang","raw_affiliation_strings":["Fudan University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Fudan University, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026167547","display_name":"Zuxuan Wu","orcid":"https://orcid.org/0000-0002-8689-5807"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zuxuan Wu","raw_affiliation_strings":["Fudan University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Fudan University, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5047962986","display_name":"Yu\u2013Gang Jiang","orcid":"https://orcid.org/0000-0002-1907-8567"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yu-Gang Jiang","raw_affiliation_strings":["Fudan University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Fudan University, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5075142476"],"corresponding_institution_ids":["https://openalex.org/I24943067"],"apc_list":null,"apc_paid":null,"fwci":0.0961,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.40681373,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":94},"biblio":{"volume":null,"issue":null,"first_page":"4843","last_page":"4847"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9958000183105469,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8461953401565552},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.6149327158927917},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6016433835029602},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.5339013338088989},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.5258712768554688},{"id":"https://openalex.org/keywords/ranking","display_name":"Ranking (information retrieval)","score":0.5256896018981934},{"id":"https://openalex.org/keywords/structuring","display_name":"Structuring","score":0.46715301275253296},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.44948017597198486},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.36626559495925903},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.35731762647628784},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.3510911464691162}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8461953401565552},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.6149327158927917},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6016433835029602},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.5339013338088989},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.5258712768554688},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.5256896018981934},{"id":"https://openalex.org/C2775945657","wikidata":"https://www.wikidata.org/wiki/Q381442","display_name":"Structuring","level":2,"score":0.46715301275253296},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.44948017597198486},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.36626559495925903},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.35731762647628784},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3510911464691162},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C10138342","wikidata":"https://www.wikidata.org/wiki/Q43015","display_name":"Finance","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3474085.3479202","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3474085.3479202","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 29th ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":6,"referenced_works":["https://openalex.org/W2093367888","https://openalex.org/W2526479943","https://openalex.org/W2963351448","https://openalex.org/W2990503944","https://openalex.org/W3034364644","https://openalex.org/W3034658206"],"related_works":["https://openalex.org/W4394443292","https://openalex.org/W4251394462","https://openalex.org/W1580673008","https://openalex.org/W1510936208","https://openalex.org/W2486167009","https://openalex.org/W1595575899","https://openalex.org/W756498608","https://openalex.org/W2135201366","https://openalex.org/W4285395220","https://openalex.org/W2493576743"],"abstract_inverted_index":{"There":[0],"is":[1],"a":[2,36,76,121,143],"growing":[3],"trend":[4],"in":[5,148,157],"placing":[6],"video":[7,48,57],"advertisements":[8,24],"on":[9],"social":[10],"platforms":[11],"for":[12,80],"online":[13],"marketing,":[14],"which":[15],"demands":[16],"automatic":[17],"approaches":[18],"to":[19,39,83,89,125],"understand":[20],"the":[21,27,41,56,90,158],"contents":[22],"of":[23,43,46,145,150],"effectively.":[25],"Taking":[26],"2021":[28,159],"TAAC":[29,160],"competition":[30],"as":[31],"an":[32],"opportunity,":[33],"we":[34,53,73,96],"developed":[35],"multimodal":[37],"system":[38],"improve":[40],"ability":[42],"structured":[44],"analysis":[45,59],"advertising":[47],"content.":[49],"In":[50,70,93],"our":[51],"framework,":[52],"break":[54],"down":[55],"structuring":[58],"problem":[60],"into":[61],"two":[62],"tasks,":[63],"i.e.,":[64],"scene":[65,71],"segmentation":[66],"and":[67,134,152],"multi-modal":[68,94],"tagging.":[69],"segmentation,":[72],"build":[74],"upon":[75],"temporal":[77,81],"convolution":[78],"module":[79],"modeling":[82],"predict":[84],"whether":[85],"adjacent":[86],"frames":[87],"belong":[88],"same":[91],"scene.":[92],"tagging,":[95],"first":[97],"compute":[98],"clip-level":[99],"visual":[100,109],"features":[101,105,110,116],"by":[102],"aggregating":[103],"frame-level":[104],"with":[106,114],"NeXt-SoftDBoF.":[107],"The":[108],"are":[111,118],"further":[112],"complemented":[113],"textual":[115],"that":[117],"derived":[119],"using":[120],"global-local":[122],"attention":[123],"mechanism":[124],"extract":[126],"useful":[127],"information":[128],"from":[129],"OCR":[130],"(Optical":[131],"Character":[132],"Recognition)":[133,138],"ASR":[135],"(Audio":[136],"Speech":[137],"outputs.":[139],"Our":[140],"solution":[141],"achieved":[142],"score":[144],"0.2470":[146],"measured":[147],"consideration":[149],"localization":[151],"prediction":[153],"accuracy,":[154],"ranking":[155],"fourth":[156],"final":[161],"leaderboard.":[162]},"counts_by_year":[{"year":2023,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
