{"id":"https://openalex.org/W4224980447","doi":"https://doi.org/10.1145/3477495.3531951","title":"CharacterBERT and Self-Teaching for Improving the Robustness of Dense Retrievers on Queries with Typos","display_name":"CharacterBERT and Self-Teaching for Improving the Robustness of Dense Retrievers on Queries with Typos","publication_year":2022,"publication_date":"2022-07-06","ids":{"openalex":"https://openalex.org/W4224980447","doi":"https://doi.org/10.1145/3477495.3531951"},"language":"en","primary_location":{"id":"doi:10.1145/3477495.3531951","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3477495.3531951","pdf_url":null,"source":{"id":"https://openalex.org/S4363608773","display_name":"Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2204.00716","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5012958162","display_name":"Shengyao Zhuang","orcid":"https://orcid.org/0000-0002-6711-0955"},"institutions":[{"id":"https://openalex.org/I165143802","display_name":"University of Queensland","ror":"https://ror.org/00rqy9422","country_code":"AU","type":"education","lineage":["https://openalex.org/I165143802"]}],"countries":["AU"],"is_corresponding":true,"raw_author_name":"Shengyao Zhuang","raw_affiliation_strings":["The University of Queensland, Brisbane, QLD, Australia"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"The University of Queensland, Brisbane, QLD, Australia","institution_ids":["https://openalex.org/I165143802"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5076031002","display_name":"Guido Zuccon","orcid":"https://orcid.org/0000-0003-0271-5563"},"institutions":[{"id":"https://openalex.org/I165143802","display_name":"University of Queensland","ror":"https://ror.org/00rqy9422","country_code":"AU","type":"education","lineage":["https://openalex.org/I165143802"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Guido Zuccon","raw_affiliation_strings":["The University of Queensland, Brisbane, QLD, Australia"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"The University of Queensland, Brisbane, QLD, Australia","institution_ids":["https://openalex.org/I165143802"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5012958162"],"corresponding_institution_ids":["https://openalex.org/I165143802"],"apc_list":null,"apc_paid":null,"fwci":2.2948,"has_fulltext":false,"cited_by_count":24,"citation_normalized_percentile":{"value":0.89941706,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1444","last_page":"1454"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9947999715805054,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8312376737594604},{"id":"https://openalex.org/keywords/lexical-analysis","display_name":"Lexical analysis","score":0.7305289506912231},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.5525161027908325},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.5221776962280273},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.47858166694641113},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3655690550804138},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3601788878440857},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.10627159476280212},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.08602184057235718}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8312376737594604},{"id":"https://openalex.org/C176982825","wikidata":"https://www.wikidata.org/wiki/Q835922","display_name":"Lexical analysis","level":2,"score":0.7305289506912231},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.5525161027908325},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.5221776962280273},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.47858166694641113},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3655690550804138},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3601788878440857},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.10627159476280212},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.08602184057235718},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3477495.3531951","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3477495.3531951","pdf_url":null,"source":{"id":"https://openalex.org/S4363608773","display_name":"Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2204.00716","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2204.00716","pdf_url":"https://arxiv.org/pdf/2204.00716","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2204.00716","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2204.00716","pdf_url":"https://arxiv.org/pdf/2204.00716","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[{"score":0.7300000190734863,"id":"https://metadata.un.org/sdg/1","display_name":"No poverty"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":55,"referenced_works":["https://openalex.org/W1821462560","https://openalex.org/W1967526029","https://openalex.org/W1982858363","https://openalex.org/W2007807439","https://openalex.org/W2081058719","https://openalex.org/W2127838257","https://openalex.org/W2259472270","https://openalex.org/W2295779180","https://openalex.org/W2525778437","https://openalex.org/W2741195357","https://openalex.org/W2750779823","https://openalex.org/W2896457183","https://openalex.org/W2951534261","https://openalex.org/W2962739339","https://openalex.org/W2965373594","https://openalex.org/W2970597249","https://openalex.org/W2979826702","https://openalex.org/W3011279327","https://openalex.org/W3021397474","https://openalex.org/W3022373106","https://openalex.org/W3034439313","https://openalex.org/W3037128914","https://openalex.org/W3093955333","https://openalex.org/W3099700870","https://openalex.org/W3115462295","https://openalex.org/W3118668786","https://openalex.org/W3152887675","https://openalex.org/W3155895380","https://openalex.org/W3166441238","https://openalex.org/W3168875417","https://openalex.org/W3174367567","https://openalex.org/W3175111331","https://openalex.org/W3180230246","https://openalex.org/W3184918446","https://openalex.org/W3185250692","https://openalex.org/W3189509741","https://openalex.org/W3193342167","https://openalex.org/W3195010973","https://openalex.org/W3198691721","https://openalex.org/W3201233724","https://openalex.org/W3206455169","https://openalex.org/W3208821253","https://openalex.org/W3214779765","https://openalex.org/W4206121183","https://openalex.org/W4221164176","https://openalex.org/W4225728306","https://openalex.org/W4238430687","https://openalex.org/W4284697650","https://openalex.org/W4286905174","https://openalex.org/W4287180852","https://openalex.org/W4287185415","https://openalex.org/W4287645694","https://openalex.org/W4324016655","https://openalex.org/W6600263792","https://openalex.org/W6601899773"],"related_works":["https://openalex.org/W4386014872","https://openalex.org/W1847536016","https://openalex.org/W4361193986","https://openalex.org/W3149094754","https://openalex.org/W2148703997","https://openalex.org/W4388335561","https://openalex.org/W2970530566","https://openalex.org/W4298195702","https://openalex.org/W3093768914","https://openalex.org/W2945402993"],"abstract_inverted_index":{"Current":[0],"dense":[1,59,124,133,254],"retrievers":[2],"are":[3,137,261],"not":[4],"robust":[5,138,253],"to":[6,111,118,131,139,205],"out-of-domain":[7],"and":[8,89,165,212,233,252,259],"outlier":[9],"queries,":[10],"i.e.":[11],"their":[12],"effectiveness":[13,57,199],"on":[14,152,200,237],"these":[15,210],"queries":[16,36,49,141,153,179,184,201,230],"is":[17,82],"much":[18],"poorer":[19],"than":[20],"what":[21],"one":[22],"would":[23],"expect.":[24],"In":[25,79],"this":[26,69],"paper,":[27],"we":[28,90,158,219],"consider":[29],"a":[30,43,93,96,222],"specific":[31],"instance":[32],"of":[33,58,68,123,216,228,250],"such":[34,140],"queries:":[35],"that":[37,42,64,92,136,175,190],"contain":[38],"typos.":[39,155,186],"We":[40,61,126],"show":[41,91,189],"small":[44],"character":[45],"level":[46],"perturbation":[47],"in":[48,71,113,192,247],"(as":[50],"caused":[51],"by":[52,77],"typos)":[53],"highly":[54],"impacts":[55],"the":[56,65,72,85,101,114,119,162,183,213,217,238,244,248],"retrievers.":[60,125,255],"then":[62,127],"demonstrate":[63],"root":[66],"cause":[67],"resides":[70],"input":[73,115],"tokenization":[74,81],"strategy":[75],"employed":[76],"BERT.":[78],"BERT,":[80],"performed":[83],"using":[84],"BERT's":[86],"WordPiece":[87],"tokenizer":[88],"token":[94,102],"with":[95,142,185,194,202,209,231],"typo":[97],"will":[98],"significantly":[99,197],"change":[100,109],"distributions":[103],"obtained":[104],"after":[105],"tokenization.":[106],"This":[107],"distribution":[108],"translates":[110],"changes":[112],"embeddings":[116],"passed":[117],"BERT-based":[120],"query":[121],"encoder":[122,164],"turn":[128],"our":[129],"attention":[130],"devising":[132],"retriever":[134],"methods":[135,151],"typos,":[143],"while":[144],"still":[145],"being":[146],"as":[147,149,161],"performant":[148],"previous":[150,206],"without":[154,180],"For":[156],"this,":[157],"use":[159],"CharacterBERT":[160,191],"backbone":[163],"an":[166],"efficient":[167],"yet":[168],"effective":[169,251],"training":[170],"method,":[171],"called":[172],"Self-Teaching":[173],"(ST),":[174],"distills":[176],"knowledge":[177],"from":[178],"typos":[181,203,232],"into":[182],"Experimental":[187],"results":[188,211,258],"combination":[193],"ST":[195],"achieves":[196],"higher":[198],"compared":[204],"methods.":[207],"Along":[208],"open-sourced":[214],"implementation":[215],"methods,":[218],"also":[220],"provide":[221],"new":[223],"passage":[224],"retrieval":[225],"dataset":[226,260],"consisting":[227],"real-world":[229],"associated":[234],"relevance":[235],"assessments":[236],"MS":[239],"MARCO":[240],"corpus,":[241],"thus":[242],"supporting":[243],"research":[245],"community":[246],"investigation":[249],"Code,":[256],"experimental":[257],"made":[262],"available":[263],"at":[264],"https://github.com/ielab/CharacterBERT-DR.":[265]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":5},{"year":2023,"cited_by_count":11},{"year":2022,"cited_by_count":1}],"updated_date":"2026-04-28T14:05:53.105641","created_date":"2022-04-28T00:00:00"}
