{"id":"https://openalex.org/W3208996364","doi":"https://doi.org/10.1145/3459637.3482008","title":"Machamp: A Generalized Entity Matching Benchmark","display_name":"Machamp: A Generalized Entity Matching Benchmark","publication_year":2021,"publication_date":"2021-10-26","ids":{"openalex":"https://openalex.org/W3208996364","doi":"https://doi.org/10.1145/3459637.3482008","mag":"3208996364"},"language":"en","primary_location":{"id":"doi:10.1145/3459637.3482008","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3459637.3482008","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Information &amp; Knowledge Management","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100346193","display_name":"Jin Wang","orcid":"https://orcid.org/0000-0002-9962-9401"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jin Wang","raw_affiliation_strings":["Megagon Labs, Mountain View, CA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Megagon Labs, Mountain View, CA, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100750716","display_name":"Yuliang Li","orcid":"https://orcid.org/0000-0002-0602-149X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuliang Li","raw_affiliation_strings":["Megagon Labs, Mountain View, CA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Megagon Labs, Mountain View, CA, USA","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5000108670","display_name":"Wataru Hirota","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wataru Hirota","raw_affiliation_strings":["Megagon Labs, Mountain View, CA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Megagon Labs, Mountain View, CA, USA","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":3.0472,"has_fulltext":false,"cited_by_count":28,"citation_normalized_percentile":{"value":0.91520097,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"4633","last_page":"4642"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9764999747276306,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.9757999777793884,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8485679626464844},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.8376513719558716},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.6294992566108704},{"id":"https://openalex.org/keywords/schema-matching","display_name":"Schema matching","score":0.6230236887931824},{"id":"https://openalex.org/keywords/schema","display_name":"Schema (genetic algorithms)","score":0.5423473119735718},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.4873688817024231},{"id":"https://openalex.org/keywords/profiling","display_name":"Profiling (computer programming)","score":0.4807281792163849},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.46719881892204285},{"id":"https://openalex.org/keywords/unstructured-data","display_name":"Unstructured data","score":0.46490538120269775},{"id":"https://openalex.org/keywords/table","display_name":"Table (database)","score":0.411914587020874},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.33792543411254883},{"id":"https://openalex.org/keywords/data-integration","display_name":"Data integration","score":0.26469436287879944},{"id":"https://openalex.org/keywords/big-data","display_name":"Big data","score":0.2169899046421051}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8485679626464844},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.8376513719558716},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.6294992566108704},{"id":"https://openalex.org/C2777327318","wikidata":"https://www.wikidata.org/wiki/Q1408390","display_name":"Schema matching","level":3,"score":0.6230236887931824},{"id":"https://openalex.org/C52146309","wikidata":"https://www.wikidata.org/wiki/Q7431116","display_name":"Schema (genetic algorithms)","level":2,"score":0.5423473119735718},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.4873688817024231},{"id":"https://openalex.org/C187191949","wikidata":"https://www.wikidata.org/wiki/Q1138496","display_name":"Profiling (computer programming)","level":2,"score":0.4807281792163849},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.46719881892204285},{"id":"https://openalex.org/C2781252014","wikidata":"https://www.wikidata.org/wiki/Q1141900","display_name":"Unstructured data","level":3,"score":0.46490538120269775},{"id":"https://openalex.org/C45235069","wikidata":"https://www.wikidata.org/wiki/Q278425","display_name":"Table (database)","level":2,"score":0.411914587020874},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.33792543411254883},{"id":"https://openalex.org/C72634772","wikidata":"https://www.wikidata.org/wiki/Q386824","display_name":"Data integration","level":2,"score":0.26469436287879944},{"id":"https://openalex.org/C75684735","wikidata":"https://www.wikidata.org/wiki/Q858810","display_name":"Big data","level":2,"score":0.2169899046421051},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3459637.3482008","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3459637.3482008","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Information &amp; Knowledge Management","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","score":0.5699999928474426,"id":"https://metadata.un.org/sdg/9"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":23,"referenced_works":["https://openalex.org/W1981590391","https://openalex.org/W2031250218","https://openalex.org/W2073471108","https://openalex.org/W2406114359","https://openalex.org/W2542998387","https://openalex.org/W2798649495","https://openalex.org/W2945883855","https://openalex.org/W2951147191","https://openalex.org/W2957204582","https://openalex.org/W2966720878","https://openalex.org/W2970641574","https://openalex.org/W2979826702","https://openalex.org/W2985009327","https://openalex.org/W3013103751","https://openalex.org/W3014295153","https://openalex.org/W3014705052","https://openalex.org/W3034997167","https://openalex.org/W3084740534","https://openalex.org/W3094039038","https://openalex.org/W3105771849","https://openalex.org/W3119752913","https://openalex.org/W3123375411","https://openalex.org/W3174036215"],"related_works":["https://openalex.org/W2378211422","https://openalex.org/W2745001401","https://openalex.org/W1528218860","https://openalex.org/W3208717180","https://openalex.org/W2406112091","https://openalex.org/W2125859764","https://openalex.org/W2014400548","https://openalex.org/W2029826694","https://openalex.org/W4298139853","https://openalex.org/W2016247499"],"abstract_inverted_index":{"Entity":[0,108],"Matching":[1,109],"(EM)":[2],"refers":[3],"to":[4,15,58,110,158],"the":[5,16,27,59,62,72,76,178,191,197],"problem":[6,105],"of":[7,26,66,92,123,134,153,177,193],"determining":[8],"whether":[9],"two":[10,63],"different":[11,167,210],"data":[12,28,64,79,93,207],"representations":[13],"refer":[14],"same":[17,73],"real-world":[18,90],"entity.":[19],"It":[20,212],"has":[21],"been":[22,34],"a":[23,102,116,151],"long-standing":[24],"interest":[25],"management":[29],"community.":[30],"Many":[31],"efforts":[32,157],"have":[33],"paid":[35],"in":[36,43,78,89,137],"creating":[37],"benchmark":[38,52,117,144],"tasks":[39,53,125,145,163,180],"as":[40,42],"well":[41],"developing":[44],"advanced":[45],"matching":[46,82,162,186],"techniques":[47,205],"for":[48,54,81,119,146],"EM.":[49],"However,":[50],"existing":[51,142],"EM":[55,143,204],"are":[56,68],"limited":[57],"case":[60],"where":[61],"collections":[65,80,208],"entities":[67],"structured":[69,147],"tables":[70,77,148,165],"with":[71,101,166,209],"schema.":[74],"Meanwhile,":[75],"could":[83],"be":[84],"structured,":[85],"semi-structured,":[86],"or":[87],"unstructured":[88],"scenarios":[91],"science.":[94],"In":[95],"this":[96,112],"paper,":[97],"we":[98,172],"come":[99],"up":[100],"new":[103],"research":[104],"-":[106],"Generalized":[107],"satisfy":[111],"requirement":[113],"and":[114,129,149,155,181],"create":[115],"Machamp":[118,121],"it.":[120],"consists":[122],"seven":[124],"having":[126],"diverse":[127],"characteristics":[128],"thus":[130],"provides":[131],"good":[132],"coverage":[133],"use":[135],"cases":[136],"real":[138],"applications.":[139],"We":[140],"summarize":[141],"conduct":[150,174],"series":[152],"processing":[154],"cleaning":[156],"transform":[159],"them":[160],"into":[161],"between":[164,206],"structures.":[168,211],"Based":[169],"on":[170,188],"that,":[171],"further":[173],"comprehensive":[175],"profiling":[176],"proposed":[179],"evaluate":[182,203],"several":[183],"popular":[184],"entity":[185],"approaches":[187],"them.":[189],"With":[190],"help":[192],"Machamp,":[194],"it":[195],"is":[196,213],"first":[198],"time":[199],"that":[200],"researchers":[201],"can":[202],"public":[214],"available":[215],"via":[216],"link:":[217],"https://github.com/megagonlabs/machamp.":[218]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":9},{"year":2024,"cited_by_count":8},{"year":2023,"cited_by_count":5},{"year":2022,"cited_by_count":4}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
