{"id":"https://openalex.org/W3169721101","doi":"https://doi.org/10.1109/icme51207.2021.9428201","title":"Combine Early and Late Fusion Together: A Hybrid Fusion Framework for Image-Text Matching","display_name":"Combine Early and Late Fusion Together: A Hybrid Fusion Framework for Image-Text Matching","publication_year":2021,"publication_date":"2021-06-09","ids":{"openalex":"https://openalex.org/W3169721101","doi":"https://doi.org/10.1109/icme51207.2021.9428201","mag":"3169721101"},"language":"en","primary_location":{"id":"doi:10.1109/icme51207.2021.9428201","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme51207.2021.9428201","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100398570","display_name":"Yifan Wang","orcid":"https://orcid.org/0000-0002-6719-5063"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yifan Wang","raw_affiliation_strings":["University of Electronic Science and Technology of China,Center for Future Media &#x0026; School of Computer Science and Engineering,China"],"affiliations":[{"raw_affiliation_string":"University of Electronic Science and Technology of China,Center for Future Media &#x0026; School of Computer Science and Engineering,China","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009074046","display_name":"Xing Xu","orcid":"https://orcid.org/0000-0001-5685-3123"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xing Xu","raw_affiliation_strings":["University of Electronic Science and Technology of China,Center for Future Media &#x0026; School of Computer Science and Engineering,China"],"affiliations":[{"raw_affiliation_string":"University of Electronic Science and Technology of China,Center for Future Media &#x0026; School of Computer Science and Engineering,China","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100901963","display_name":"Wei Yu","orcid":null},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wei Yu","raw_affiliation_strings":["University of Electronic Science and Technology of China,Center for Future Media &#x0026; School of Computer Science and Engineering,China","Glasgow College, University of Electronic Science and Technology of China, China"],"affiliations":[{"raw_affiliation_string":"University of Electronic Science and Technology of China,Center for Future Media &#x0026; School of Computer Science and Engineering,China","institution_ids":["https://openalex.org/I150229711"]},{"raw_affiliation_string":"Glasgow College, University of Electronic Science and Technology of China, China","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052701049","display_name":"Ruicong Xu","orcid":"https://orcid.org/0000-0002-3463-8606"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ruicong Xu","raw_affiliation_strings":["Meituan"],"affiliations":[{"raw_affiliation_string":"Meituan","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020177102","display_name":"Zuo Cao","orcid":"https://orcid.org/0000-0001-9446-5105"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zuo Cao","raw_affiliation_strings":["Meituan"],"affiliations":[{"raw_affiliation_string":"Meituan","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5052993469","display_name":"Heng Tao Shen","orcid":"https://orcid.org/0000-0002-2999-2088"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Heng Tao Shen","raw_affiliation_strings":["University of Electronic Science and Technology of China,Center for Future Media &#x0026; School of Computer Science and Engineering,China"],"affiliations":[{"raw_affiliation_string":"University of Electronic Science and Technology of China,Center for Future Media &#x0026; School of Computer Science and Engineering,China","institution_ids":["https://openalex.org/I150229711"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5100398570"],"corresponding_institution_ids":["https://openalex.org/I150229711"],"apc_list":null,"apc_paid":null,"fwci":0.8646,"has_fulltext":false,"cited_by_count":13,"citation_normalized_percentile":{"value":0.75310458,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9984999895095825,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9952999949455261,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7531450986862183},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.6735295653343201},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6402332782745361},{"id":"https://openalex.org/keywords/image-fusion","display_name":"Image fusion","score":0.6211426258087158},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.5962510704994202},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.5730020999908447},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.5660524964332581},{"id":"https://openalex.org/keywords/fusion","display_name":"Fusion","score":0.5635079145431519},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.5586040019989014},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.5422258377075195},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.49392277002334595},{"id":"https://openalex.org/keywords/scheme","display_name":"Scheme (mathematics)","score":0.46767956018447876},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4468718469142914},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4425082802772522},{"id":"https://openalex.org/keywords/sensor-fusion","display_name":"Sensor fusion","score":0.4220290780067444},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.34856003522872925},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.32377946376800537},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.32041603326797485},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.07528045773506165},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.06521016359329224}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7531450986862183},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.6735295653343201},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6402332782745361},{"id":"https://openalex.org/C69744172","wikidata":"https://www.wikidata.org/wiki/Q860822","display_name":"Image fusion","level":3,"score":0.6211426258087158},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.5962510704994202},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.5730020999908447},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.5660524964332581},{"id":"https://openalex.org/C158525013","wikidata":"https://www.wikidata.org/wiki/Q2593739","display_name":"Fusion","level":2,"score":0.5635079145431519},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.5586040019989014},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.5422258377075195},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.49392277002334595},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.46767956018447876},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4468718469142914},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4425082802772522},{"id":"https://openalex.org/C33954974","wikidata":"https://www.wikidata.org/wiki/Q486494","display_name":"Sensor fusion","level":2,"score":0.4220290780067444},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.34856003522872925},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.32377946376800537},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.32041603326797485},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.07528045773506165},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.06521016359329224},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C201995342","wikidata":"https://www.wikidata.org/wiki/Q682496","display_name":"Systems engineering","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icme51207.2021.9428201","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme51207.2021.9428201","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.4300000071525574,"display_name":"Quality Education"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335787","display_name":"Fundamental Research Funds for the Central Universities","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":25,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W2185175083","https://openalex.org/W2606473278","https://openalex.org/W2774267535","https://openalex.org/W2890531016","https://openalex.org/W2949197413","https://openalex.org/W2962856082","https://openalex.org/W2962964995","https://openalex.org/W2964120214","https://openalex.org/W2970869018","https://openalex.org/W2981448908","https://openalex.org/W2981694290","https://openalex.org/W2982461533","https://openalex.org/W2988823324","https://openalex.org/W2994818707","https://openalex.org/W3005971801","https://openalex.org/W3010277541","https://openalex.org/W3014958591","https://openalex.org/W3035454331","https://openalex.org/W3035605030","https://openalex.org/W6639102338","https://openalex.org/W6728881024","https://openalex.org/W6747225742","https://openalex.org/W6749537441","https://openalex.org/W6754778999"],"related_works":["https://openalex.org/W73545470","https://openalex.org/W4224266612","https://openalex.org/W2383394264","https://openalex.org/W4320153225","https://openalex.org/W4293261942","https://openalex.org/W3125968744","https://openalex.org/W2004831463","https://openalex.org/W2167701463","https://openalex.org/W2110287964","https://openalex.org/W2168054807"],"abstract_inverted_index":{"Image-text":[0],"matching":[1,172],"is":[2,64,178],"a":[3,65],"challenging":[4],"task":[5],"in":[6,118],"cross-modal":[7,36],"learning":[8],"due":[9],"to":[10,30,41,55,127],"the":[11,27,74,78,83,87,98,120,134,143,148,171],"discrepancy":[12],"of":[13,19,77,112,174],"data":[14],"representation":[15],"be-tween":[16],"different":[17],"modalities":[18,104],"images":[20],"and":[21,38,58,93,102,122,151,167],"texts.":[22],"The":[23,158],"main-stream":[24],"methods":[25],"adopt":[26],"late":[28,79,94,144],"fusion":[29,68,80,92,95,113,145],"generate":[31],"image-text":[32,131,164],"similarity":[33],"on":[34,142,160],"encoded":[35],"features,":[37],"put":[39],"effort":[40],"capture":[42],"intra-modality":[43],"associations":[44],"with":[45,90,154,181],"considerably":[46],"high":[47],"training":[48,184],"cost.":[49],"In":[50,82,133],"this":[51],"work,":[52],"we":[53,136],"propose":[54],"Combine":[56],"Early":[57],"Late":[59],"Fusion":[60],"Together":[61],"(CELFT),":[62],"which":[63,125],"universal":[66],"hybrid":[67,88],"framework":[69],"that":[70,170],"can":[71],"effectively":[72],"overcome":[73],"above":[75],"shortcomings":[76],"scheme.":[81],"pro-posed":[84],"CELFT":[85,156],"framework,":[86],"structure":[89],"early":[91,106],"could":[96],"facilitate":[97],"interaction":[99],"between":[100],"image":[101],"text":[103],"at":[105],"stage.":[107],"Moreover,":[108],"these":[109],"two":[110,161],"kinds":[111],"strategies":[114],"complement":[115],"each":[116],"other":[117],"capturing":[119],"inter-modal":[121],"intra-modal":[123],"information,":[124],"ensure":[126],"learn":[128],"more":[129],"accurate":[130],"similarity.":[132],"experiments,":[135],"choose":[137],"four":[138],"latest":[139],"approaches":[140],"based":[141],"scheme":[146],"as":[147],"base":[149,176],"models,":[150],"integrate":[152],"them":[153],"our":[155],"framework.":[157],"results":[159],"widely":[162],"used":[163],"datasets":[165],"MSCOCO":[166],"Flickr30K":[168],"show":[169],"performance":[173],"all":[175],"models":[177],"significantly":[179],"improved":[180],"remarkably":[182],"reduced":[183],"time.":[185]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":5},{"year":2023,"cited_by_count":2},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
