{"id":"https://openalex.org/W3177445587","doi":"https://doi.org/10.1145/3448016.3457548","title":"Allign: Aligning All-Pair Near-Duplicate Passages in Long Texts","display_name":"Allign: Aligning All-Pair Near-Duplicate Passages in Long Texts","publication_year":2021,"publication_date":"2021-06-09","ids":{"openalex":"https://openalex.org/W3177445587","doi":"https://doi.org/10.1145/3448016.3457548","mag":"3177445587"},"language":"en","primary_location":{"id":"doi:10.1145/3448016.3457548","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3448016.3457548","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2021 International Conference on Management of Data","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5003295104","display_name":"Weiqi Feng","orcid":"https://orcid.org/0009-0004-5853-220X"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Weiqi Feng","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5103237059","display_name":"Dong Deng","orcid":"https://orcid.org/0000-0002-4596-3850"},"institutions":[{"id":"https://openalex.org/I102322142","display_name":"Rutgers, The State University of New Jersey","ror":"https://ror.org/05vt9qd57","country_code":"US","type":"education","lineage":["https://openalex.org/I102322142"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Dong Deng","raw_affiliation_strings":["Rutgers University, Piscataway, NJ, USA"],"affiliations":[{"raw_affiliation_string":"Rutgers University, Piscataway, NJ, USA","institution_ids":["https://openalex.org/I102322142"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5003295104"],"corresponding_institution_ids":["https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":1.5685,"has_fulltext":false,"cited_by_count":11,"citation_normalized_percentile":{"value":0.83307805,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"541","last_page":"553"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9957000017166138,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10764","display_name":"Privacy-Preserving Technologies in Data","score":0.9843999743461609,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/hash-function","display_name":"Hash function","score":0.8064889907836914},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6895159482955933},{"id":"https://openalex.org/keywords/perfect-hash-function","display_name":"Perfect hash function","score":0.5291904211044312},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.514430582523346},{"id":"https://openalex.org/keywords/heuristic","display_name":"Heuristic","score":0.4287703335285187},{"id":"https://openalex.org/keywords/hash-table","display_name":"Hash table","score":0.3749815821647644},{"id":"https://openalex.org/keywords/theoretical-computer-science","display_name":"Theoretical computer science","score":0.3294142484664917},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.171356201171875},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.14483743906021118},{"id":"https://openalex.org/keywords/genetics","display_name":"Genetics","score":0.0756731927394867}],"concepts":[{"id":"https://openalex.org/C99138194","wikidata":"https://www.wikidata.org/wiki/Q183427","display_name":"Hash function","level":2,"score":0.8064889907836914},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6895159482955933},{"id":"https://openalex.org/C87431388","wikidata":"https://www.wikidata.org/wiki/Q2070573","display_name":"Perfect hash function","level":4,"score":0.5291904211044312},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.514430582523346},{"id":"https://openalex.org/C173801870","wikidata":"https://www.wikidata.org/wiki/Q201413","display_name":"Heuristic","level":2,"score":0.4287703335285187},{"id":"https://openalex.org/C67388219","wikidata":"https://www.wikidata.org/wiki/Q207440","display_name":"Hash table","level":3,"score":0.3749815821647644},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3294142484664917},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.171356201171875},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.14483743906021118},{"id":"https://openalex.org/C54355233","wikidata":"https://www.wikidata.org/wiki/Q7162","display_name":"Genetics","level":1,"score":0.0756731927394867},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3448016.3457548","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3448016.3457548","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2021 International Conference on Management of Data","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","score":0.5199999809265137,"id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":49,"referenced_works":["https://openalex.org/W1455310343","https://openalex.org/W1575370549","https://openalex.org/W1790180460","https://openalex.org/W1973001156","https://openalex.org/W1974336599","https://openalex.org/W1991175610","https://openalex.org/W2007842132","https://openalex.org/W2053017876","https://openalex.org/W2092782467","https://openalex.org/W2097184821","https://openalex.org/W2097776316","https://openalex.org/W2099370490","https://openalex.org/W2104671598","https://openalex.org/W2109803107","https://openalex.org/W2111295912","https://openalex.org/W2119455368","https://openalex.org/W2121269638","https://openalex.org/W2121516976","https://openalex.org/W2127434515","https://openalex.org/W2139660688","https://openalex.org/W2148578434","https://openalex.org/W2156855109","https://openalex.org/W2164634022","https://openalex.org/W2167302605","https://openalex.org/W2167847032","https://openalex.org/W2169054943","https://openalex.org/W2218762779","https://openalex.org/W2261895596","https://openalex.org/W2294331997","https://openalex.org/W2397770138","https://openalex.org/W2430378630","https://openalex.org/W2515119766","https://openalex.org/W2522338500","https://openalex.org/W2566373727","https://openalex.org/W2599156210","https://openalex.org/W2795518213","https://openalex.org/W2798412430","https://openalex.org/W2920553042","https://openalex.org/W3102192406","https://openalex.org/W3105727767","https://openalex.org/W4251391716","https://openalex.org/W6629956336","https://openalex.org/W6632555176","https://openalex.org/W6636190696","https://openalex.org/W6679663036","https://openalex.org/W6682042839","https://openalex.org/W6682678078","https://openalex.org/W6986298011","https://openalex.org/W7004726404"],"related_works":["https://openalex.org/W2097286495","https://openalex.org/W2080388000","https://openalex.org/W2065331859","https://openalex.org/W2155123971","https://openalex.org/W1897694601","https://openalex.org/W1845395494","https://openalex.org/W2144265691","https://openalex.org/W1835589799","https://openalex.org/W1605991620","https://openalex.org/W4385261619"],"abstract_inverted_index":{"In":[0,189,259],"this":[1,51,77,80,169,260],"paper,":[2,81],"we":[3,82,287],"study":[4],"the":[5,36,46,64,109,114,182,199,222,237,252,265,277,315],"problem":[6],"of":[7,21,50,66,126,176,211,227,244,256,268],"aligning":[8],"all-pair":[9],"near-duplicate":[10,92,279],"passages":[11,146,180,201,280],"in":[12,24,35,79,95,108,135,158,202,214],"two":[13,96,159,215],"long":[14,97],"texts.":[15,98],"A":[16],"passage":[17,93,107,115,138,156,238,269],"is":[18,150],"a":[19,25,40,84,101,124,173,186,203,209,292],"sequence":[20],"consecutive":[22],"words":[23],"text.":[26],"It":[27],"can":[28,62],"begin":[29],"and":[30,72,111,130,144,161,178,283,302],"end":[31],"with":[32,128,181,205],"any":[33],"word":[34],"text,":[37],"whether":[38],"around":[39],"period":[41],"or":[42],"not.":[43],"Due":[44],"to":[45,57,89,153,185,196,275,290],"high":[47],"computation":[48],"cost":[49],"problem,":[52,78],"existing":[53],"work":[54],"all":[55,91,113,155,198,236],"compromise":[56],"heuristic":[58],"alignment":[59,318],"methods,":[60],"which":[61,249],"harm":[63],"recall":[65],"downstream":[67],"applications,":[68],"such":[69],"as":[70],"deduplication":[71],"plagiarism":[73],"detection.":[74],"To":[75,167],"address":[76,168],"propose":[83],"min-hash":[85,103,120,165,184,257],"based":[86],"method":[87],"Allign":[88,99,171,191,234,262,284,312],"find":[90],"pairs":[94,116,139,157,239],"generates":[100,192],"few":[102,293],"values":[104],"for":[105,123],"each":[106],"texts":[110,127,160,216],"reports":[112,235],"sharing":[117],"enough":[118,242,254],"common":[119,164],"values.":[121,166,258],"However,":[122],"pair":[125,210],"n":[129,206],"m":[131],"words,":[132],"there":[133],"are":[134,217,231],"total":[136],"O(n2m2)":[137],"(each":[140],"text":[141,204,317],"contains":[142],"O(n2)":[143,200],"O(m2)":[145],"respectively).":[147],"Thus":[148],"it":[149],"prohibitively":[151],"expensive":[152],"enumerate":[154],"count":[162],"their":[163],"issue,":[170],"packs":[172],"large":[174],"number":[175,243,255,267],"nearby":[177],"overlapping":[179],"same":[183,223,253],"\"compact":[187],"window\".":[188],"total,":[190],"O(n)":[193],"compact":[194,212,229,246],"windows":[195,213,230],"represent":[197],"words.":[207],"Next,":[208],"matched":[218,245],"if":[219],"they":[220],"have":[221],"min-hash.":[224],"The":[225],"rest":[226],"unmatched":[228],"removed.":[232],"Finally,":[233],"contained":[240],"by":[241],"window":[247],"pairs,":[248],"must":[250],"share":[251],"way,":[261],"avoids":[263],"enumerating":[264],"enormous":[266],"pairs.":[270],"Last":[271],"but":[272],"not":[273],"least,":[274],"make":[276],"reported":[278],"more":[281,285],"relevant":[282],"efficient,":[286],"show":[288,310],"how":[289],"support":[291],"practical":[294],"constraints":[295],"efficiently,":[296],"including":[297],"reporting":[298],"only":[299],"longest":[300],"near-duplicates":[301],"sentence-level":[303],"near-duplicates.":[304],"Experimental":[305],"results":[306],"on":[307],"real-world":[308],"datasets":[309],"that":[311],"significantly":[313],"outperforms":[314],"state-of-the-art":[316],"methods.":[319]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":5},{"year":2023,"cited_by_count":2},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
