{"id":"https://openalex.org/W4304092154","doi":"https://doi.org/10.1145/3503161.3551610","title":"Leveraging Text Representation and Face-head Tracking for Long-form Multimodal Semantic Relation Understanding","display_name":"Leveraging Text Representation and Face-head Tracking for Long-form Multimodal Semantic Relation Understanding","publication_year":2022,"publication_date":"2022-10-10","ids":{"openalex":"https://openalex.org/W4304092154","doi":"https://doi.org/10.1145/3503161.3551610"},"language":"en","primary_location":{"id":"doi:10.1145/3503161.3551610","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3503161.3551610","pdf_url":null,"source":{"id":"https://openalex.org/S4363608757","display_name":"Proceedings of the 30th ACM International Conference on Multimedia","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5104006262","display_name":"Raksha Ramesh","orcid":null},"institutions":[{"id":"https://openalex.org/I78577930","display_name":"Columbia University","ror":"https://ror.org/00hj8s172","country_code":"US","type":"education","lineage":["https://openalex.org/I78577930"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Raksha Ramesh","raw_affiliation_strings":["Columbia University &amp; Graphen Inc., New York, NY, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Columbia University &amp; Graphen Inc., New York, NY, USA","institution_ids":["https://openalex.org/I78577930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101715411","display_name":"Vishal Anand","orcid":"https://orcid.org/0000-0001-9632-6934"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]},{"id":"https://openalex.org/I78577930","display_name":"Columbia University","ror":"https://ror.org/00hj8s172","country_code":"US","type":"education","lineage":["https://openalex.org/I78577930"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Vishal Anand","raw_affiliation_strings":["Columbia University &amp; Microsoft Corporation, Redmond, WA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Columbia University &amp; Microsoft Corporation, Redmond, WA, USA","institution_ids":["https://openalex.org/I1290206253","https://openalex.org/I78577930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019559271","display_name":"Zifan Chen","orcid":"https://orcid.org/0000-0002-1928-3755"},"institutions":[{"id":"https://openalex.org/I78577930","display_name":"Columbia University","ror":"https://ror.org/00hj8s172","country_code":"US","type":"education","lineage":["https://openalex.org/I78577930"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zifan Chen","raw_affiliation_strings":["Columbia University &amp; Graphen Inc., New York, NY, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Columbia University &amp; Graphen Inc., New York, NY, USA","institution_ids":["https://openalex.org/I78577930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090991667","display_name":"Yifei Dong","orcid":"https://orcid.org/0000-0002-7995-9657"},"institutions":[{"id":"https://openalex.org/I78577930","display_name":"Columbia University","ror":"https://ror.org/00hj8s172","country_code":"US","type":"education","lineage":["https://openalex.org/I78577930"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yifei Dong","raw_affiliation_strings":["Columbia University &amp; Graphen Inc., New York, NY, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Columbia University &amp; Graphen Inc., New York, NY, USA","institution_ids":["https://openalex.org/I78577930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100416519","display_name":"Yun Chen","orcid":"https://orcid.org/0000-0001-6917-7814"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yun Chen","raw_affiliation_strings":["Graphen Inc., New York, NY, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Graphen Inc., New York, NY, USA","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5109855094","display_name":"Ching\u2010Yung Lin","orcid":null},"institutions":[{"id":"https://openalex.org/I78577930","display_name":"Columbia University","ror":"https://ror.org/00hj8s172","country_code":"US","type":"education","lineage":["https://openalex.org/I78577930"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ching-Yung Lin","raw_affiliation_strings":["Columbia University &amp; Graphen Inc., New York, NY, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Columbia University &amp; Graphen Inc., New York, NY, USA","institution_ids":["https://openalex.org/I78577930"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.177,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.50912189,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"7215","last_page":"7219"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8213775157928467},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6375290155410767},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.6346039175987244},{"id":"https://openalex.org/keywords/relation","display_name":"Relation (database)","score":0.5842651128768921},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5779585838317871},{"id":"https://openalex.org/keywords/conversation","display_name":"Conversation","score":0.5687400102615356},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5567060112953186},{"id":"https://openalex.org/keywords/head","display_name":"Head (geology)","score":0.5247581005096436},{"id":"https://openalex.org/keywords/face","display_name":"Face (sociological concept)","score":0.5041137933731079},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.45553871989250183},{"id":"https://openalex.org/keywords/tracking","display_name":"Tracking (education)","score":0.4520754814147949},{"id":"https://openalex.org/keywords/eye-tracking","display_name":"Eye tracking","score":0.45050787925720215},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.4472605884075165},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.42836955189704895},{"id":"https://openalex.org/keywords/knowledge-base","display_name":"Knowledge base","score":0.41820794343948364},{"id":"https://openalex.org/keywords/natural-language-understanding","display_name":"Natural language understanding","score":0.4131259024143219},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.3964800238609314},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.1570071578025818},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.09604394435882568}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8213775157928467},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6375290155410767},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.6346039175987244},{"id":"https://openalex.org/C25343380","wikidata":"https://www.wikidata.org/wiki/Q277521","display_name":"Relation (database)","level":2,"score":0.5842651128768921},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5779585838317871},{"id":"https://openalex.org/C2777200299","wikidata":"https://www.wikidata.org/wiki/Q52943","display_name":"Conversation","level":2,"score":0.5687400102615356},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5567060112953186},{"id":"https://openalex.org/C2780312720","wikidata":"https://www.wikidata.org/wiki/Q5689100","display_name":"Head (geology)","level":2,"score":0.5247581005096436},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.5041137933731079},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.45553871989250183},{"id":"https://openalex.org/C2775936607","wikidata":"https://www.wikidata.org/wiki/Q466845","display_name":"Tracking (education)","level":2,"score":0.4520754814147949},{"id":"https://openalex.org/C56461940","wikidata":"https://www.wikidata.org/wiki/Q970687","display_name":"Eye tracking","level":2,"score":0.45050787925720215},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.4472605884075165},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.42836955189704895},{"id":"https://openalex.org/C4554734","wikidata":"https://www.wikidata.org/wiki/Q593744","display_name":"Knowledge base","level":2,"score":0.41820794343948364},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.4131259024143219},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.3964800238609314},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.1570071578025818},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.09604394435882568},{"id":"https://openalex.org/C19417346","wikidata":"https://www.wikidata.org/wiki/Q7922","display_name":"Pedagogy","level":1,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C114793014","wikidata":"https://www.wikidata.org/wiki/Q52109","display_name":"Geomorphology","level":1,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C127313418","wikidata":"https://www.wikidata.org/wiki/Q1069","display_name":"Geology","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3503161.3551610","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3503161.3551610","pdf_url":null,"source":{"id":"https://openalex.org/S4363608757","display_name":"Proceedings of the 30th ACM International Conference on Multimedia","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":14,"referenced_works":["https://openalex.org/W1556914148","https://openalex.org/W2108598243","https://openalex.org/W2277195237","https://openalex.org/W2493916176","https://openalex.org/W2969985801","https://openalex.org/W2970231061","https://openalex.org/W2981851019","https://openalex.org/W3021553284","https://openalex.org/W3035017890","https://openalex.org/W3093087807","https://openalex.org/W3095299523","https://openalex.org/W3105473141","https://openalex.org/W3113477099","https://openalex.org/W3207650505"],"related_works":["https://openalex.org/W2573309543","https://openalex.org/W2166773478","https://openalex.org/W2367925007","https://openalex.org/W2805834743","https://openalex.org/W4288263119","https://openalex.org/W3015724364","https://openalex.org/W2967994095","https://openalex.org/W4285240985","https://openalex.org/W2900126711","https://openalex.org/W4286930972"],"abstract_inverted_index":{"In":[0,18],"the":[1],"intricate":[2],"problem":[3],"of":[4,54,58],"understanding":[5,31,89],"long-form":[6],"multi-modal":[7],"inputs,":[8],"few":[9],"key-aspects":[10,25],"in":[11,35],"scene-understanding":[12],"and":[13,29,40,82,87],"dialogue-and-discourse":[14],"are":[15,73],"often":[16],"overlooked.":[17],"this":[19],"paper,":[20],"we":[21],"investigate":[22],"two":[23],"such":[24],"for":[26,49,85],"better":[27],"semantic":[28],"relational":[30],"-":[32],"(i).":[33],"head-object-tracking":[34],"addition":[36],"to":[37,52,67],"usual":[38],"face-tracking,":[39],"(ii).":[41],"fusing":[42],"scene-to-text":[43],"representation":[44],"with":[45,62],"external":[46],"common-sense":[47],"knowledge-base":[48],"effective":[50],"mapping":[51,66],"sub-tasks":[53],"interest.":[55],"The":[56],"usage":[57],"head-tracking":[59],"especially":[60],"helps":[61],"enriching":[63],"sparse":[64],"entity":[65],"inter-entity":[68],"conversation":[69],"interactions.":[70],"These":[71],"methods":[72],"guided":[74],"by":[75],"natural":[76],"language":[77],"supervision":[78],"on":[79],"visual":[80],"models,":[81],"perform":[83],"well":[84],"interaction":[86],"sentiment":[88],"tasks.":[90]},"counts_by_year":[{"year":2023,"cited_by_count":3}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
