{"id":"https://openalex.org/W2941492599","doi":"https://doi.org/10.21437/interspeech.2019-3051","title":"On the Contributions of Visual and Textual Supervision in Low-Resource Semantic Speech Retrieval","display_name":"On the Contributions of Visual and Textual Supervision in Low-Resource Semantic Speech Retrieval","publication_year":2019,"publication_date":"2019-09-13","ids":{"openalex":"https://openalex.org/W2941492599","doi":"https://doi.org/10.21437/interspeech.2019-3051","mag":"2941492599"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2019-3051","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2019-3051","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2019","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/1904.10947","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5059672669","display_name":"Ankita Pasad","orcid":null},"institutions":[{"id":"https://openalex.org/I160992636","display_name":"Toyota Technological Institute at Chicago","ror":"https://ror.org/02sn5gb64","country_code":"US","type":"education","lineage":["https://openalex.org/I160992636"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Ankita Pasad","raw_affiliation_strings":["Toyota Technological Institute at Chicago,"],"affiliations":[{"raw_affiliation_string":"Toyota Technological Institute at Chicago,","institution_ids":["https://openalex.org/I160992636"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085193896","display_name":"Bowen Shi","orcid":"https://orcid.org/0000-0002-4753-7572"},"institutions":[{"id":"https://openalex.org/I160992636","display_name":"Toyota Technological Institute at Chicago","ror":"https://ror.org/02sn5gb64","country_code":"US","type":"education","lineage":["https://openalex.org/I160992636"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Bowen Shi","raw_affiliation_strings":["Toyota Technological Institute at Chicago,"],"affiliations":[{"raw_affiliation_string":"Toyota Technological Institute at Chicago,","institution_ids":["https://openalex.org/I160992636"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040305929","display_name":"Herman Kamper","orcid":"https://orcid.org/0000-0003-2980-3475"},"institutions":[{"id":"https://openalex.org/I160992636","display_name":"Toyota Technological Institute at Chicago","ror":"https://ror.org/02sn5gb64","country_code":"US","type":"education","lineage":["https://openalex.org/I160992636"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Herman Kamper","raw_affiliation_strings":["Toyota Technological Institute at Chicago,"],"affiliations":[{"raw_affiliation_string":"Toyota Technological Institute at Chicago,","institution_ids":["https://openalex.org/I160992636"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5015602781","display_name":"Karen Livescu","orcid":"https://orcid.org/0000-0003-4962-946X"},"institutions":[{"id":"https://openalex.org/I26092322","display_name":"Stellenbosch University","ror":"https://ror.org/05bk57929","country_code":"ZA","type":"education","lineage":["https://openalex.org/I26092322"]}],"countries":["ZA"],"is_corresponding":false,"raw_author_name":"Karen Livescu","raw_affiliation_strings":["StellenBosch University"],"affiliations":[{"raw_affiliation_string":"StellenBosch University","institution_ids":["https://openalex.org/I26092322"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5059672669"],"corresponding_institution_ids":["https://openalex.org/I160992636"],"apc_list":null,"apc_paid":null,"fwci":0.1022,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":{"value":0.41432298,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":94},"biblio":{"volume":null,"issue":null,"first_page":"4195","last_page":"4199"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9976000189781189,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7915551066398621},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.7611123323440552},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.701225757598877},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.6171183586120605},{"id":"https://openalex.org/keywords/relevance","display_name":"Relevance (law)","score":0.5330414175987244},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.525276780128479},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5214880704879761},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5019900798797607},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.42767518758773804},{"id":"https://openalex.org/keywords/resource","display_name":"Resource (disambiguation)","score":0.420119047164917},{"id":"https://openalex.org/keywords/spoken-language","display_name":"Spoken language","score":0.4138319790363312},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3927334249019623},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.353272408246994}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7915551066398621},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.7611123323440552},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.701225757598877},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.6171183586120605},{"id":"https://openalex.org/C158154518","wikidata":"https://www.wikidata.org/wiki/Q7310970","display_name":"Relevance (law)","level":2,"score":0.5330414175987244},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.525276780128479},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5214880704879761},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5019900798797607},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.42767518758773804},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.420119047164917},{"id":"https://openalex.org/C2776230583","wikidata":"https://www.wikidata.org/wiki/Q1322198","display_name":"Spoken language","level":2,"score":0.4138319790363312},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3927334249019623},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.353272408246994},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.21437/interspeech.2019-3051","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2019-3051","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2019","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:1904.10947","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1904.10947","pdf_url":"https://arxiv.org/pdf/1904.10947","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"mag:2941492599","is_oa":true,"landing_page_url":"https://arxiv.org/abs/1904.10947","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"arXiv (Cornell University)","raw_type":null},{"id":"doi:10.48550/arxiv.1904.10947","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.1904.10947","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:1904.10947","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1904.10947","pdf_url":"https://arxiv.org/pdf/1904.10947","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[{"score":0.8199999928474426,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2941492599.pdf","grobid_xml":"https://content.openalex.org/works/W2941492599.grobid-xml"},"referenced_works_count":30,"referenced_works":["https://openalex.org/W385555557","https://openalex.org/W1861492603","https://openalex.org/W1984076147","https://openalex.org/W2070753207","https://openalex.org/W2126203737","https://openalex.org/W2148986421","https://openalex.org/W2168119002","https://openalex.org/W2171019095","https://openalex.org/W2185175083","https://openalex.org/W2187089797","https://openalex.org/W2194775991","https://openalex.org/W2516255829","https://openalex.org/W2531381952","https://openalex.org/W2556930864","https://openalex.org/W2566587499","https://openalex.org/W2586148577","https://openalex.org/W2884975363","https://openalex.org/W2927673779","https://openalex.org/W2950133079","https://openalex.org/W2962753610","https://openalex.org/W2962862718","https://openalex.org/W2963168538","https://openalex.org/W2963303028","https://openalex.org/W2963525826","https://openalex.org/W2963902314","https://openalex.org/W2964001192","https://openalex.org/W2964099072","https://openalex.org/W2964115348","https://openalex.org/W2964121744","https://openalex.org/W2964222437"],"related_works":["https://openalex.org/W2973135958","https://openalex.org/W2903320905","https://openalex.org/W2601713192","https://openalex.org/W2906407728","https://openalex.org/W3167119498","https://openalex.org/W2964099072","https://openalex.org/W3032892481","https://openalex.org/W2963902314","https://openalex.org/W3015356123","https://openalex.org/W3040260790","https://openalex.org/W2960271609","https://openalex.org/W3112385539","https://openalex.org/W2522415541","https://openalex.org/W3197567540","https://openalex.org/W2809767522","https://openalex.org/W2556930864","https://openalex.org/W2912641852","https://openalex.org/W3174321708","https://openalex.org/W2949642982","https://openalex.org/W3209871323"],"abstract_inverted_index":{"Recent":[0],"work":[1],"has":[2],"shown":[3],"that":[4,122],"speech":[5,16,62],"paired":[6],"with":[7,83,107],"images":[8,82],"can":[9],"be":[10],"used":[11],"to":[12,32,100],"learn":[13],"semantically":[14],"meaningful":[15],"representations":[17],"even":[18,127],"without":[19],"any":[20],"textual":[21,52,105,132],"supervision.":[22,53,164],"In":[23,54],"real-world":[24],"low-resource":[25,66],"settings,":[26],"however,":[27],"we":[28,56,135,154],"often":[29],"have":[30],"access":[31],"some":[33],"transcribed":[34,145,152],"speech.":[35],"We":[36,68,94,120],"study":[37],"whether":[38],"and":[39,75,86,104,134],"how":[40],"visual":[41,103,108,123,163],"grounding":[42,124],"is":[43,125],"useful":[44],"in":[45,64,110,128],"the":[46,58,111,129],"presence":[47,130],"of":[48,51,60,91,113,131,142,144,151],"varying":[49],"amounts":[50],"particular,":[55],"consider":[57],"task":[59],"semantic":[61,92],"retrieval":[63],"a":[65,70,96,140],"setting.":[67],"use":[69],"previously":[71],"studied":[72],"data":[73,146],"set":[74],"task,":[76],"where":[77],"models":[78],"are":[79],"trained":[80],"on":[81,88],"spoken":[84],"captions":[85],"evaluated":[87],"human":[89],"judgments":[90],"relevance.":[93],"propose":[95],"multitask":[97],"learning":[98],"approach":[99],"leverage":[101],"both":[102],"modalities,":[106],"supervision":[109],"form":[112],"keyword":[114],"probabilities":[115],"from":[116],"an":[117],"external":[118],"tagger.":[119],"find":[121],"helpful":[126],"supervision,":[133],"analyze":[136],"this":[137],"effect":[138],"over":[139],"range":[141],"sizes":[143],"sets.":[147],"With":[148],"~5":[149],"hours":[150],"speech,":[153],"obtain":[155],"23%":[156],"higher":[157],"average":[158],"precision":[159],"when":[160],"also":[161],"using":[162]},"counts_by_year":[{"year":2022,"cited_by_count":1}],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2025-10-10T00:00:00"}
