{"id":"https://openalex.org/W2889266022","doi":"https://doi.org/10.18653/v1/d18-1169","title":"Card-660: Cambridge Rare Word Dataset - a Reliable Benchmark for Infrequent Word Representation Models","display_name":"Card-660: Cambridge Rare Word Dataset - a Reliable Benchmark for Infrequent Word Representation Models","publication_year":2018,"publication_date":"2018-01-01","ids":{"openalex":"https://openalex.org/W2889266022","doi":"https://doi.org/10.18653/v1/d18-1169","mag":"2889266022"},"language":"en","primary_location":{"id":"doi:10.18653/v1/d18-1169","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/d18-1169","pdf_url":"https://www.aclweb.org/anthology/D18-1169.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://www.aclweb.org/anthology/D18-1169.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5091017313","display_name":"Mohammad Taher Pilehvar","orcid":"https://orcid.org/0000-0003-3694-4006"},"institutions":[{"id":"https://openalex.org/I241749","display_name":"University of Cambridge","ror":"https://ror.org/013meh722","country_code":"GB","type":"education","lineage":["https://openalex.org/I241749"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Mohammad Taher Pilehvar","raw_affiliation_strings":["Language Technology Lab, Department of Theoretical and Applied Linguistics University of Cambridge, United Kingdom","University of Cambridge, Cambridge, United Kingdom"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Language Technology Lab, Department of Theoretical and Applied Linguistics University of Cambridge, United Kingdom","institution_ids":["https://openalex.org/I241749"]},{"raw_affiliation_string":"University of Cambridge, Cambridge, United Kingdom","institution_ids":["https://openalex.org/I241749"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032324612","display_name":"Dimitri Kartsaklis","orcid":null},"institutions":[{"id":"https://openalex.org/I166337079","display_name":"Queen Mary University of London","ror":"https://ror.org/026zzn846","country_code":"GB","type":"education","lineage":["https://openalex.org/I124357947","https://openalex.org/I166337079"]},{"id":"https://openalex.org/I241749","display_name":"University of Cambridge","ror":"https://ror.org/013meh722","country_code":"GB","type":"education","lineage":["https://openalex.org/I241749"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Dimitri Kartsaklis","raw_affiliation_strings":["Language Technology Lab, Department of Theoretical and Applied Linguistics University of Cambridge, United Kingdom","Queen Mary University of London, London, United Kingdom"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Language Technology Lab, Department of Theoretical and Applied Linguistics University of Cambridge, United Kingdom","institution_ids":["https://openalex.org/I241749"]},{"raw_affiliation_string":"Queen Mary University of London, London, United Kingdom","institution_ids":["https://openalex.org/I166337079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087598239","display_name":"Victor Prokhorov","orcid":null},"institutions":[{"id":"https://openalex.org/I241749","display_name":"University of Cambridge","ror":"https://ror.org/013meh722","country_code":"GB","type":"education","lineage":["https://openalex.org/I241749"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Victor Prokhorov","raw_affiliation_strings":["Language Technology Lab, Department of Theoretical and Applied Linguistics University of Cambridge, United Kingdom","University of Cambridge, Cambridge, United Kingdom"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Language Technology Lab, Department of Theoretical and Applied Linguistics University of Cambridge, United Kingdom","institution_ids":["https://openalex.org/I241749"]},{"raw_affiliation_string":"University of Cambridge, Cambridge, United Kingdom","institution_ids":["https://openalex.org/I241749"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5073413742","display_name":"Nigel Collier","orcid":"https://orcid.org/0000-0002-7230-4164"},"institutions":[{"id":"https://openalex.org/I241749","display_name":"University of Cambridge","ror":"https://ror.org/013meh722","country_code":"GB","type":"education","lineage":["https://openalex.org/I241749"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Nigel Collier","raw_affiliation_strings":["Language Technology Lab, Department of Theoretical and Applied Linguistics University of Cambridge, United Kingdom","University of Cambridge, Cambridge, United Kingdom"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Language Technology Lab, Department of Theoretical and Applied Linguistics University of Cambridge, United Kingdom","institution_ids":["https://openalex.org/I241749"]},{"raw_affiliation_string":"University of Cambridge, Cambridge, United Kingdom","institution_ids":["https://openalex.org/I241749"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.5069,"has_fulltext":true,"cited_by_count":3,"citation_normalized_percentile":{"value":0.74122087,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":94},"biblio":{"volume":null,"issue":null,"first_page":"1391","last_page":"1401"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13083","display_name":"Advanced Text Analysis Techniques","score":0.9939000010490417,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8101452589035034},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.7637894749641418},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.7588224411010742},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.6619943380355835},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5955199003219604},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.5574510097503662},{"id":"https://openalex.org/keywords/vocabulary","display_name":"Vocabulary","score":0.5342295169830322},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5205867886543274},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.4416457414627075},{"id":"https://openalex.org/keywords/annotation","display_name":"Annotation","score":0.42861688137054443},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.0949404239654541}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8101452589035034},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.7637894749641418},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.7588224411010742},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.6619943380355835},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5955199003219604},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.5574510097503662},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.5342295169830322},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5205867886543274},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.4416457414627075},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.42861688137054443},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0949404239654541},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0}],"mesh":[],"locations_count":6,"locations":[{"id":"doi:10.18653/v1/d18-1169","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/d18-1169","pdf_url":"https://www.aclweb.org/anthology/D18-1169.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:1808.09308","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1808.09308","pdf_url":"https://arxiv.org/pdf/1808.09308","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"mag:2889266022","is_oa":true,"landing_page_url":"https://arxiv.org/pdf/1808.09308.pdf","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"arXiv (Cornell University)","raw_type":null},{"id":"pmh:oai:www.repository.cam.ac.uk:1810/288010","is_oa":false,"landing_page_url":"https://www.repository.cam.ac.uk/handle/1810/288010","pdf_url":null,"source":{"id":"https://openalex.org/S4306401777","display_name":"Apollo (University of Cambridge)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I241749","host_organization_name":"University of Cambridge","host_organization_lineage":["https://openalex.org/I241749"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Conference Object"},{"id":"doi:10.48550/arxiv.1808.09308","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.1808.09308","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"},{"id":"doi:10.17863/cam.35329","is_oa":true,"landing_page_url":"https://doi.org/10.17863/cam.35329","pdf_url":null,"source":{"id":"https://openalex.org/S7407050737","display_name":"Apollo","issn_l":null,"issn":[],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.18653/v1/d18-1169","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/d18-1169","pdf_url":"https://www.aclweb.org/anthology/D18-1169.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.7900000214576721,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[{"id":"https://openalex.org/G2663634960","display_name":"PheneBank: automatic extraction and validation of a database of human phenotype-disease associations in the scientific literature","funder_award_id":"MR/M025160/1","funder_id":"https://openalex.org/F4320334626","funder_display_name":"Medical Research Council"},{"id":"https://openalex.org/G5094827420","display_name":"SIPHS: Semantic interpretation of personal health messages for generating public health summaries","funder_award_id":"EP/M005089/1","funder_id":"https://openalex.org/F4320334627","funder_display_name":"Engineering and Physical Sciences Research Council"},{"id":"https://openalex.org/G7009347466","display_name":null,"funder_award_id":"MR/M025160/1","funder_id":"https://openalex.org/F4320334626","funder_display_name":"Medical Research Council"},{"id":"https://openalex.org/G7718721289","display_name":null,"funder_award_id":"EP/M005089/1","funder_id":"https://openalex.org/F4320334627","funder_display_name":"Engineering and Physical Sciences Research Council"}],"funders":[{"id":"https://openalex.org/F4320334626","display_name":"Medical Research Council","ror":"https://ror.org/03x94j517"},{"id":"https://openalex.org/F4320334627","display_name":"Engineering and Physical Sciences Research Council","ror":"https://ror.org/0439y7842"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2889266022.pdf","grobid_xml":"https://content.openalex.org/works/W2889266022.grobid-xml"},"referenced_works_count":30,"referenced_works":["https://openalex.org/W22168010","https://openalex.org/W1573498319","https://openalex.org/W1854884267","https://openalex.org/W2047782770","https://openalex.org/W2053921957","https://openalex.org/W2064675550","https://openalex.org/W2080100102","https://openalex.org/W2094728533","https://openalex.org/W2112184938","https://openalex.org/W2113459411","https://openalex.org/W2129250947","https://openalex.org/W2155870214","https://openalex.org/W2163302275","https://openalex.org/W2250539671","https://openalex.org/W2250799184","https://openalex.org/W2251012068","https://openalex.org/W2294970769","https://openalex.org/W2296600977","https://openalex.org/W2418340875","https://openalex.org/W2427527485","https://openalex.org/W2493916176","https://openalex.org/W2601529995","https://openalex.org/W2610242619","https://openalex.org/W2613312549","https://openalex.org/W2737092125","https://openalex.org/W2738321088","https://openalex.org/W2753628379","https://openalex.org/W2949679234","https://openalex.org/W2950577311","https://openalex.org/W2964207259"],"related_works":["https://openalex.org/W2954699761","https://openalex.org/W3208211829","https://openalex.org/W2250734228","https://openalex.org/W2954397822","https://openalex.org/W2952094673","https://openalex.org/W2952531073","https://openalex.org/W3152867187","https://openalex.org/W2972252470","https://openalex.org/W2399340618","https://openalex.org/W2510721067","https://openalex.org/W2963863453","https://openalex.org/W2547322381","https://openalex.org/W2743354079","https://openalex.org/W3166339217","https://openalex.org/W2885064623","https://openalex.org/W2098419420","https://openalex.org/W2813700965","https://openalex.org/W3045913839","https://openalex.org/W2808633496","https://openalex.org/W2963619022"],"abstract_inverted_index":{"Rare":[0,54,83],"word":[1,84,89,102,117],"representation":[2,103],"has":[3],"recently":[4],"enjoyed":[5],"a":[6,30,69,94,106,141],"surge":[7],"of":[8,18,32,39,108,121,144],"interest,":[9],"owing":[10],"to":[11,75,128,140],"the":[12,48,114,137,148,151],"crucial":[13],"role":[14],"that":[15,47,112],"effective":[16],"handling":[17],"infrequent":[19],"words":[20,122],"can":[21],"play":[22],"in":[23,44,123],"accurate":[24],"semantic":[25],"understanding.":[26],"However,":[27],"there":[28],"is":[29],"paucity":[31],"reliable":[33],"benchmarks":[34],"for":[35,100],"evaluation":[36,78],"and":[37,61,150],"comparison":[38,71],"these":[40],"techniques.":[41,104],"We":[42,146],"show":[43,111],"this":[45,77],"paper":[46],"only":[49],"existing":[50],"benchmark":[51,99],"(the":[52],"Stanford":[53],"Word":[55],"dataset)":[56],"suffers":[57],"from":[58],"low-confidence":[59],"annotations":[60],"limited":[62],"vocabulary;":[63],"hence,":[64],"it":[65],"does":[66],"not":[67],"constitute":[68],"solid":[70],"framework.":[72],"In":[73],"order":[74],"fill":[76],"gap,":[79],"we":[80,110],"propose":[81],"CAmbridge":[82],"Dataset":[85],"(CARD-660),":[86],"an":[87],"expert-annotated":[88],"similarity":[90],"dataset":[91,149],"which":[92],"provides":[93],"highly":[95],"reliable,":[96],"yet":[97],"challenging,":[98],"rare":[101],"Through":[105],"set":[107],"experiments":[109],"even":[113],"best":[115],"mainstream":[116],"embeddings,":[118],"with":[119],"millions":[120],"their":[124],"vocabularies,":[125],"are":[126],"unable":[127],"achieve":[129],"performances":[130],"higher":[131],"than":[132],"0.43":[133],"(Pearson":[134],"correlation)":[135],"on":[136],"dataset,":[138],"compared":[139],"human-level":[142],"upperbound":[143],"0.90.":[145],"release":[147],"annotation":[152],"materials":[153],"at":[154],"https://":[155],"pilehvar.github.io/card-660/.":[156]},"counts_by_year":[{"year":2021,"cited_by_count":1},{"year":2020,"cited_by_count":1},{"year":2019,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
