{"id":"https://openalex.org/W4415540526","doi":"https://doi.org/10.1145/3746027.3755204","title":"Optimal Feature Embedding for Document Large Visual Language Model","display_name":"Optimal Feature Embedding for Document Large Visual Language Model","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415540526","doi":"https://doi.org/10.1145/3746027.3755204"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3755204","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755204","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5075129077","display_name":"Fan Yang","orcid":"https://orcid.org/0000-0001-5821-021X"},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Fan Yang","raw_affiliation_strings":["School of Electronic and Information Engineering, South China University of Technology, Guangzhou, China"],"raw_orcid":"https://orcid.org/0000-0001-5821-021X","affiliations":[{"raw_affiliation_string":"School of Electronic and Information Engineering, South China University of Technology, Guangzhou, China","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101130399","display_name":"Ling Deng","orcid":"https://orcid.org/0009-0002-5607-8192"},"institutions":[{"id":"https://openalex.org/I6507939","display_name":"China United Network Communications Group (China)","ror":"https://ror.org/028w99c90","country_code":"CN","type":"company","lineage":["https://openalex.org/I6507939"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ling Deng","raw_affiliation_strings":["China United Network Communications Corporation Limited Guangdong Branch, Guangzhou, China"],"raw_orcid":"https://orcid.org/0009-0002-5607-8192","affiliations":[{"raw_affiliation_string":"China United Network Communications Corporation Limited Guangdong Branch, Guangzhou, China","institution_ids":["https://openalex.org/I6507939"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056718604","display_name":"Zhiyong Gan","orcid":"https://orcid.org/0009-0008-6451-1068"},"institutions":[{"id":"https://openalex.org/I6507939","display_name":"China United Network Communications Group (China)","ror":"https://ror.org/028w99c90","country_code":"CN","type":"company","lineage":["https://openalex.org/I6507939"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhiyong Gan","raw_affiliation_strings":["China United Network Communications Corporation Limited Guangdong Branch, Guangzhou, China"],"raw_orcid":"https://orcid.org/0009-0008-6451-1068","affiliations":[{"raw_affiliation_string":"China United Network Communications Corporation Limited Guangdong Branch, Guangzhou, China","institution_ids":["https://openalex.org/I6507939"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Qisheng He","orcid":"https://orcid.org/0009-0002-6535-0100"},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qisheng He","raw_affiliation_strings":["South China University of Technology, Guangzhou, China"],"raw_orcid":"https://orcid.org/0009-0002-6535-0100","affiliations":[{"raw_affiliation_string":"South China University of Technology, Guangzhou, China","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053328795","display_name":"Yuanbo Fang","orcid":"https://orcid.org/0000-0002-3830-0178"},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuanbo Fang","raw_affiliation_strings":["South China University of Technology, Guangzhou, China"],"raw_orcid":"https://orcid.org/0000-0002-3830-0178","affiliations":[{"raw_affiliation_string":"South China University of Technology, Guangzhou, China","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007354180","display_name":"Xiangmin Xu","orcid":"https://orcid.org/0000-0003-4573-5820"},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiangmin Xu","raw_affiliation_strings":["South China University of Technology, Guangzhou, China"],"raw_orcid":"https://orcid.org/0000-0003-4573-5820","affiliations":[{"raw_affiliation_string":"South China University of Technology, Guangzhou, China","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101528918","display_name":"Shuangping Huang","orcid":"https://orcid.org/0000-0002-5544-4544"},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shuangping Huang","raw_affiliation_strings":["South China University of Technology, Guangzhou, China"],"raw_orcid":"https://orcid.org/0000-0002-5544-4544","affiliations":[{"raw_affiliation_string":"South China University of Technology, Guangzhou, China","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5052027147","display_name":"Tianshui Chen","orcid":"https://orcid.org/0000-0002-5848-5624"},"institutions":[{"id":"https://openalex.org/I139024713","display_name":"Guangdong University of Technology","ror":"https://ror.org/04azbjn80","country_code":"CN","type":"education","lineage":["https://openalex.org/I139024713"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tianshui Chen","raw_affiliation_strings":["Guangdong University of Technology, Guangzhou, China"],"raw_orcid":"https://orcid.org/0000-0002-5848-5624","affiliations":[{"raw_affiliation_string":"Guangdong University of Technology, Guangzhou, China","institution_ids":["https://openalex.org/I139024713"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.9349,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.80324616,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"3817","last_page":"3826"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.7986999750137329},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.7250999808311462},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.7116000056266785},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5734999775886536},{"id":"https://openalex.org/keywords/hierarchy","display_name":"Hierarchy","score":0.4918000102043152},{"id":"https://openalex.org/keywords/position","display_name":"Position (finance)","score":0.4327999949455261},{"id":"https://openalex.org/keywords/feature-model","display_name":"Feature model","score":0.42750000953674316},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.41589999198913574}],"concepts":[{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.7986999750137329},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.737500011920929},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.7250999808311462},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.7116000056266785},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5910000205039978},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5734999775886536},{"id":"https://openalex.org/C31170391","wikidata":"https://www.wikidata.org/wiki/Q188619","display_name":"Hierarchy","level":2,"score":0.4918000102043152},{"id":"https://openalex.org/C198082294","wikidata":"https://www.wikidata.org/wiki/Q3399648","display_name":"Position (finance)","level":2,"score":0.4327999949455261},{"id":"https://openalex.org/C101814296","wikidata":"https://www.wikidata.org/wiki/Q5439685","display_name":"Feature model","level":3,"score":0.42750000953674316},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.41589999198913574},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3808000087738037},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3555000126361847},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3513999879360199},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3504999876022339},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.33329999446868896},{"id":"https://openalex.org/C2779227376","wikidata":"https://www.wikidata.org/wiki/Q6505497","display_name":"Layer (electronics)","level":2,"score":0.33000001311302185},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.32910001277923584},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.31949999928474426},{"id":"https://openalex.org/C83248878","wikidata":"https://www.wikidata.org/wiki/Q344000","display_name":"Active appearance model","level":3,"score":0.2913999855518341},{"id":"https://openalex.org/C2780148112","wikidata":"https://www.wikidata.org/wiki/Q1432581","display_name":"Proxy (statistics)","level":2,"score":0.2856000065803528},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.26489999890327454},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.26190000772476196},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.25609999895095825},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2558000087738037},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2515999972820282},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.2513999938964844}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3755204","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755204","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":14,"referenced_works":["https://openalex.org/W240417119","https://openalex.org/W1966382373","https://openalex.org/W1985495176","https://openalex.org/W2194775991","https://openalex.org/W3090449556","https://openalex.org/W3107064625","https://openalex.org/W3173220247","https://openalex.org/W4225829029","https://openalex.org/W4382240022","https://openalex.org/W4383604278","https://openalex.org/W4386066115","https://openalex.org/W4394337593","https://openalex.org/W4400915096","https://openalex.org/W4403585670"],"related_works":[],"abstract_inverted_index":{"Document":[0],"Large":[1],"Vision":[2,33],"Language":[3,34],"Models":[4],"excel":[5],"in":[6,65,79],"document-centric":[7,190],"tasks":[8],"and":[9,139],"have":[10],"become":[11],"a":[12,22,31,40,66,80,111,130],"key":[13],"focus":[14],"of":[15,30,47,113,122],"research.":[16],"Existing":[17],"frameworks":[18],"embed":[19],"features":[20],"from":[21,73],"lightweight,":[23],"document-specific":[24],"encoder":[25,75],"into":[26],"the":[27,52,62,71,74,92,96,104,107,115,120,123,173,177,180],"first":[28],"layer":[29,90],"general-purpose":[32],"Model":[35],"(VLM).":[36],"However,":[37],"this":[38],"introduces":[39],"feature":[41,53,64,72],"mismatch":[42],"problem.":[43],"VLMs":[44],"typically":[45],"consist":[46],"many":[48],"stacked":[49],"layers,":[50],"with":[51,172,179],"hierarchy":[54],"becoming":[55],"increasingly":[56],"abstract":[57],"at":[58],"higher":[59],"layers.":[60],"Specifically,":[61],"first-layer":[63],"VLM":[67,93],"is":[68,76,84],"token-level,":[69],"whereas":[70],"task-level,":[77],"resulting":[78],"mismatch.":[81],"Consequently,":[82],"it":[83],"crucial":[85],"to":[86,157],"identify":[87],"an":[88],"optimal":[89,108,181],"within":[91],"for":[94,106],"embedding":[95,109,167,182],"encoder's":[97],"features.":[98],"Inspired":[99],"by":[100,154],"physics,":[101],"we":[102,127],"reformulate":[103],"search":[105,152],"as":[110],"problem":[112],"finding":[114],"shortest":[116,124],"time":[117,125,153],"curve.":[118],"Leveraging":[119],"properties":[121],"curve,":[126],"theoretically":[128],"derive":[129],"task-agnostic":[131],"proxy":[132],"score":[133],"that":[134,149,164,169],"requires":[135],"only":[136],"partial":[137],"training":[138],"propose":[140],"our":[141],"searching":[142],"framework,":[143],"Brac4VLM.":[144],"Our":[145],"theoretical":[146],"derivation":[147],"shows":[148],"Brac4VLM":[150,165],"reduces":[151],"97.8%":[155],"compared":[156],"brute-force":[158],"methods.":[159],"Experimental":[160],"results":[161],"further":[162],"demonstrate":[163],"identifies":[166],"points":[168],"closely":[170],"align":[171],"true":[174],"optima.":[175],"Moreover,":[176],"DocVLM":[178],"position":[183],"identified":[184],"achieves":[185],"state-of-the-art":[186],"performance":[187],"across":[188],"various":[189],"tasks.":[191],"Codes:":[192],"https://github.com/MaxKinny/Brac4VLM.":[193]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-25T00:00:00"}
