{"id":"https://openalex.org/W2945941062","doi":"https://doi.org/10.1145/3318299.3318369","title":"Text Deduplication with Minimum Loss Ratio","display_name":"Text Deduplication with Minimum Loss Ratio","publication_year":2019,"publication_date":"2019-02-22","ids":{"openalex":"https://openalex.org/W2945941062","doi":"https://doi.org/10.1145/3318299.3318369","mag":"2945941062"},"language":"en","primary_location":{"id":"doi:10.1145/3318299.3318369","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3318299.3318369","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2019 11th International Conference on Machine Learning and Computing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5067282682","display_name":"Youming Ge","orcid":"https://orcid.org/0000-0001-5995-3242"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Youming Ge","raw_affiliation_strings":["Sun Yat-Sen University, Guangzhou, P.R. China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Sun Yat-Sen University, Guangzhou, P.R. China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084941909","display_name":"Jiefeng Wu","orcid":"https://orcid.org/0000-0002-2329-7834"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiefeng Wu","raw_affiliation_strings":["Sun Yat-Sen University, Guangzhou, P.R. China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Sun Yat-Sen University, Guangzhou, P.R. China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018994926","display_name":"Genan Dai","orcid":"https://orcid.org/0000-0003-2583-0433"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Genan Dai","raw_affiliation_strings":["Sun Yat-Sen University, Guangzhou, P.R. China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Sun Yat-Sen University, Guangzhou, P.R. China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5020985581","display_name":"Yubao Liu","orcid":"https://orcid.org/0000-0002-5027-227X"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yubao Liu","raw_affiliation_strings":["Sun Yat-Sen University, Guangzhou, P.R. China and Guangdong Key Laboratory of Big Data Analysis and Processing, Guangzhou, P. R. China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Sun Yat-Sen University, Guangzhou, P.R. China and Guangdong Key Laboratory of Big Data Analysis and Processing, Guangzhou, P. R. China","institution_ids":["https://openalex.org/I157773358"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.1446,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.56238753,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"310","last_page":"316"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9968000054359436,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7978246212005615},{"id":"https://openalex.org/keywords/data-deduplication","display_name":"Data deduplication","score":0.7950685620307922},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.6799015998840332},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.5413522720336914},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.5187311768531799},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.4931764006614685},{"id":"https://openalex.org/keywords/greedy-algorithm","display_name":"Greedy algorithm","score":0.47555679082870483},{"id":"https://openalex.org/keywords/text-graph","display_name":"Text graph","score":0.4392751455307007},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.3624674677848816},{"id":"https://openalex.org/keywords/text-mining","display_name":"Text mining","score":0.3024919033050537},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.26863664388656616},{"id":"https://openalex.org/keywords/theoretical-computer-science","display_name":"Theoretical computer science","score":0.268038809299469},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.22168979048728943},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.17064619064331055},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.10507917404174805}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7978246212005615},{"id":"https://openalex.org/C32587265","wikidata":"https://www.wikidata.org/wiki/Q1182260","display_name":"Data deduplication","level":2,"score":0.7950685620307922},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.6799015998840332},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.5413522720336914},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.5187311768531799},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.4931764006614685},{"id":"https://openalex.org/C51823790","wikidata":"https://www.wikidata.org/wiki/Q504353","display_name":"Greedy algorithm","level":2,"score":0.47555679082870483},{"id":"https://openalex.org/C66945725","wikidata":"https://www.wikidata.org/wiki/Q18388823","display_name":"Text graph","level":3,"score":0.4392751455307007},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3624674677848816},{"id":"https://openalex.org/C71472368","wikidata":"https://www.wikidata.org/wiki/Q676880","display_name":"Text mining","level":2,"score":0.3024919033050537},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.26863664388656616},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.268038809299469},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.22168979048728943},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.17064619064331055},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.10507917404174805},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3318299.3318369","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3318299.3318369","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2019 11th International Conference on Machine Learning and Computing","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335795","display_name":"Science and Technology Planning Project of Guangdong Province","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":23,"referenced_works":["https://openalex.org/W25706487","https://openalex.org/W1544505227","https://openalex.org/W1966401207","https://openalex.org/W1968889737","https://openalex.org/W1987777228","https://openalex.org/W2001496424","https://openalex.org/W2004025009","https://openalex.org/W2009276147","https://openalex.org/W2012451152","https://openalex.org/W2012833704","https://openalex.org/W2031655803","https://openalex.org/W2042641525","https://openalex.org/W2065259291","https://openalex.org/W2067432306","https://openalex.org/W2085922539","https://openalex.org/W2106895292","https://openalex.org/W2109803107","https://openalex.org/W2132313482","https://openalex.org/W2148885851","https://openalex.org/W2150815390","https://openalex.org/W2223881431","https://openalex.org/W2285211177","https://openalex.org/W2401610261"],"related_works":["https://openalex.org/W2770471982","https://openalex.org/W2770474375","https://openalex.org/W4384067529","https://openalex.org/W1625494842","https://openalex.org/W2604161433","https://openalex.org/W2152349655","https://openalex.org/W2372183225","https://openalex.org/W3133621124","https://openalex.org/W2389119968","https://openalex.org/W2365299969"],"abstract_inverted_index":{"Text":[0],"deduplication":[1],"is":[2,46],"an":[3],"important":[4],"operation":[5],"for":[6,59,109,125,137],"text":[7,15,23,41,53,60,81,99,129],"document":[8,149],"analysis":[9],"applications.":[10],"Given":[11],"a":[12,106,126],"set":[13,38,51,75,79,127],"of":[14,39,52,80,98,116,128,154],"documents,":[16],"we":[17,65],"often":[18],"need":[19],"to":[20,43,71,85,101],"remove":[21],"the":[22,32,37,49,67,73,77,87,95,114,122,134,138,146,152,155],"documents":[24,42,54,100],"whose":[25],"similarity":[26,117],"values":[27],"are":[28],"not":[29,57],"less":[30],"than":[31],"specified":[33],"threshold.":[34],"However,":[35],"if":[36],"similar":[40,123],"be":[44,56,102],"removed":[45,74],"too":[47],"large,":[48],"remaining":[50,78],"may":[55],"enough":[58],"analysis.":[61],"In":[62],"this":[63],"paper,":[64],"consider":[66,133],"problem":[68,111],"on":[69,113,145],"how":[70],"balance":[72],"and":[76],"documents.":[82,130],"We":[83,104,131],"try":[84],"reduce":[86],"duplication":[88],"information":[89],"as":[90,92],"much":[91],"possible":[93],"with":[94],"minimum":[96],"number":[97],"removed.":[103],"propose":[105],"greedy":[107],"algorithm":[108,136],"our":[110],"based":[112,144],"concept":[115],"graph":[118],"which":[119],"can":[120],"represent":[121],"relationship":[124],"also":[132],"incremental":[135],"dynamic":[139],"settings.":[140],"The":[141],"experimental":[142],"results":[143],"real":[147],"news":[148],"datasets":[150],"show":[151],"efficiency":[153],"proposed":[156],"algorithms.":[157]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":1},{"year":2022,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
