{"id":"https://openalex.org/W3201709741","doi":"https://doi.org/10.17615/dk8k-sc87","title":"Pre-Training Methods for Vision and Language","display_name":"Pre-Training Methods for Vision and Language","publication_year":2021,"publication_date":"2021-08-18","ids":{"openalex":"https://openalex.org/W3201709741","doi":"https://doi.org/10.17615/dk8k-sc87","mag":"3201709741"},"language":"en","primary_location":{"id":"mag:3201709741","is_oa":false,"landing_page_url":"https://cdr.lib.unc.edu/concern/dissertations/bg257q50z","pdf_url":null,"source":null,"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":null},"type":"article","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.17615/dk8k-sc87","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5103017806","display_name":"Hao Tan","orcid":"https://orcid.org/0009-0001-1201-6136"},"institutions":[{"id":"https://openalex.org/I114027177","display_name":"University of North Carolina at Chapel Hill","ror":"https://ror.org/0130frc33","country_code":"US","type":"education","lineage":["https://openalex.org/I114027177"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Tan, Hao","raw_affiliation_strings":["University of North Carolina at Chapel Hill"],"affiliations":[{"raw_affiliation_string":"University of North Carolina at Chapel Hill","institution_ids":["https://openalex.org/I114027177"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5103017806"],"corresponding_institution_ids":["https://openalex.org/I114027177"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.13818302,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13175","display_name":"Historical Astronomy and Related Studies","score":0.0674000009894371,"subfield":{"id":"https://openalex.org/subfields/3103","display_name":"Astronomy and Astrophysics"},"field":{"id":"https://openalex.org/fields/31","display_name":"Physics and Astronomy"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13175","display_name":"Historical Astronomy and Related Studies","score":0.0674000009894371,"subfield":{"id":"https://openalex.org/subfields/3103","display_name":"Astronomy and Astrophysics"},"field":{"id":"https://openalex.org/fields/31","display_name":"Physics and Astronomy"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11904","display_name":"Spatial Cognition and Navigation","score":0.06319999694824219,"subfield":{"id":"https://openalex.org/subfields/2203","display_name":"Automotive Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13382","display_name":"Robotics and Automated Systems","score":0.060499999672174454,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.6292921900749207},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5125434398651123},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.49321240186691284},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.35939645767211914},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3477921187877655},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.3460195064544678},{"id":"https://openalex.org/keywords/philosophy","display_name":"Philosophy","score":0.1566486358642578},{"id":"https://openalex.org/keywords/geography","display_name":"Geography","score":0.15624502301216125}],"concepts":[{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.6292921900749207},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5125434398651123},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.49321240186691284},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.35939645767211914},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3477921187877655},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.3460195064544678},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.1566486358642578},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.15624502301216125},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"mag:3201709741","is_oa":false,"landing_page_url":"https://cdr.lib.unc.edu/concern/dissertations/bg257q50z","pdf_url":null,"source":null,"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":null},{"id":"doi:10.17615/dk8k-sc87","is_oa":true,"landing_page_url":"https://doi.org/10.17615/dk8k-sc87","pdf_url":null,"source":{"id":"https://openalex.org/S7407051488","display_name":"UNC Libraries","issn_l":null,"issn":[],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"thesis"}],"best_oa_location":{"id":"doi:10.17615/dk8k-sc87","is_oa":true,"landing_page_url":"https://doi.org/10.17615/dk8k-sc87","pdf_url":null,"source":{"id":"https://openalex.org/S7407051488","display_name":"UNC Libraries","issn_l":null,"issn":[],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"thesis"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.5899999737739563}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2887026948","https://openalex.org/W2411015974","https://openalex.org/W1998693564","https://openalex.org/W339753493","https://openalex.org/W2959162379","https://openalex.org/W148144599","https://openalex.org/W2063703966","https://openalex.org/W176787048","https://openalex.org/W231952954","https://openalex.org/W118869088","https://openalex.org/W2820755663","https://openalex.org/W2906035574","https://openalex.org/W1543735777","https://openalex.org/W2962997514","https://openalex.org/W2087364748","https://openalex.org/W314294914","https://openalex.org/W205871521","https://openalex.org/W3156778636","https://openalex.org/W3119760525","https://openalex.org/W2085236321"],"abstract_inverted_index":{"Vision":[0],"and":[1,11,24,42,79,85,103,134,147,170,204,215,241],"language":[2,84,104,164,202,213],"are":[3,30,59,207],"the":[4,40,55,65,77,88,96,153,179,198,232],"primary":[5],"modalities":[6],"of":[7,19,57,90,155],"our":[8],"human":[9],"perception":[10],"learning.":[12,217],"Recent":[13],"years":[14],"have":[15,64],"witnessed":[16],"fast":[17],"development":[18],"methods":[20,29,99,222],"that":[21,53,75,231],"connect":[22],"vision":[23,86,101,205],"language.":[25],"Current":[26],"deep":[27],"learning":[28],"data-hungry,":[31],"thus":[32],"pre-training":[33,51,102,118,122,157,203,206,226],"on":[34,47,138,228],"large-scale":[35],"data":[36,130],"helps":[37],"warm":[38],"up":[39],"model":[41,234],"shows":[43],"better":[44],"fine-tuning":[45],"results":[46,137],"downstream":[48],"tasks.":[49,160,176],"However,":[50],"frameworks":[52],"exploit":[54],"power":[56],"multi-modality":[58],"still":[60],"underexplored.":[61],"Specifically,":[62],"we":[63,70,94],"following":[66],"questions":[67],"remaining:":[68],"Could":[69,83,93],"build":[71,115],"large":[72],"pre-trained":[73,233],"models":[74],"understand":[76],"interactions":[78],"alignments":[80],"between":[81],"modalities?":[82],"help":[87],"understanding":[89,165],"each":[91,194],"other?":[92],"combine":[95,219],"current":[97,201],"diverse":[98],"for":[100,193],"pre-training?":[105],"This":[106,121],"dissertation":[107],"aims":[108],"to":[109,174,182,189],"answer":[110],"these":[111,220],"questions.":[112],"I":[113,162,177,218],"first":[114],"a":[116,224],"vision-and-language":[117,125,159],"framework:":[119],"LXMERT.":[120],"framework":[123,227],"learns":[124,188],"joint":[126],"representations":[127],"from":[128],"massive":[129],"(e.g.,":[131],"MS":[132],"COCO)":[133],"achieves":[135],"state-of-the-art":[136],"several":[139],"benchmark":[140],"tasks":[141],"such":[142,230],"as":[143],"image":[144],"question":[145],"answering":[146],"visual":[148,168,185],"reasoning.":[149],"We":[150],"also":[151],"illustrate":[152],"importance":[154],"single-modality":[156],"in":[158,197],"Next,":[161],"improve":[163],"via":[166],"dense":[167],"supervision":[169],"show":[171],"its":[172],"generalization":[173],"pure-text":[175],"develop":[178],"vokenization":[180],"method":[181],"construct":[183],"this":[184],"supervision,":[186],"which":[187],"retrieve":[190],"related":[191],"images":[192],"contextualized":[195],"token":[196],"sentence.":[199],"Lastly,":[200],"led":[208],"by":[209],"different":[210],"pretext":[211],"tasks:":[212],"modeling":[214],"contrastive":[216],"two":[221],"into":[223],"unified":[225],"videos,":[229],"could":[235],"capture":[236],"both":[237],"static":[238],"spatial":[239],"contents":[240],"dynamic":[242],"temporal":[243],"interactions.":[244]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
