{"id":"https://openalex.org/W2056970273","doi":"https://doi.org/10.1017/s1351324906004293","title":"Segmentation and alignment of parallel text for statistical machine translation","display_name":"Segmentation and alignment of parallel text for statistical machine translation","publication_year":2006,"publication_date":"2006-07-06","ids":{"openalex":"https://openalex.org/W2056970273","doi":"https://doi.org/10.1017/s1351324906004293","mag":"2056970273"},"language":"en","primary_location":{"id":"doi:10.1017/s1351324906004293","is_oa":false,"landing_page_url":"https://doi.org/10.1017/s1351324906004293","pdf_url":null,"source":{"id":"https://openalex.org/S18088403","display_name":"Natural Language Engineering","issn_l":"1351-3249","issn":["1351-3249","1469-8110"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310311721","host_organization_name":"Cambridge University Press","host_organization_lineage":["https://openalex.org/P4310311721","https://openalex.org/P4310311702"],"host_organization_lineage_names":["Cambridge University Press","University of Cambridge"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Natural Language Engineering","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5084723540","display_name":"Yonggang Deng","orcid":"https://orcid.org/0000-0001-8564-4988"},"institutions":[{"id":"https://openalex.org/I145311948","display_name":"Johns Hopkins University","ror":"https://ror.org/00za53h95","country_code":"US","type":"education","lineage":["https://openalex.org/I145311948"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"YONGGANG DENG","raw_affiliation_strings":["Center for Language and Speech Processing, Department of Electrical and Computer Engineering, The Johns Hopkins University, 3400 N. Charles St., Baltimore, MD 21218, USA e-mail:","Center for Language and Speech Processing, Department of Electrical and Computer Engineering, The Johns Hopkins University, 3400 N. Charles St., Baltimore, MD 21218, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Center for Language and Speech Processing, Department of Electrical and Computer Engineering, The Johns Hopkins University, 3400 N. Charles St., Baltimore, MD 21218, USA e-mail:","institution_ids":["https://openalex.org/I145311948"]},{"raw_affiliation_string":"Center for Language and Speech Processing, Department of Electrical and Computer Engineering, The Johns Hopkins University, 3400 N. Charles St., Baltimore, MD 21218, USA","institution_ids":["https://openalex.org/I145311948"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101051347","display_name":"Shankar Kumar","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"SHANKAR KUMAR","raw_affiliation_strings":["Google Inc., 1600 Amphitheatre Parkway, Mountain View, CA 94043, USA e-mail:","Google Inc., 1600 Amphitheatre Parkway, Mountain View , CA , 94043 , USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Google Inc., 1600 Amphitheatre Parkway, Mountain View, CA 94043, USA e-mail:","institution_ids":["https://openalex.org/I1291425158"]},{"raw_affiliation_string":"Google Inc., 1600 Amphitheatre Parkway, Mountain View , CA , 94043 , USA","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"last","author":{"id":null,"display_name":"WILLIAM BYRNE","orcid":null},"institutions":[{"id":"https://openalex.org/I241749","display_name":"University of Cambridge","ror":"https://ror.org/013meh722","country_code":"GB","type":"education","lineage":["https://openalex.org/I241749"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"WILLIAM BYRNE","raw_affiliation_strings":["Department of Engineering, Cambridge University, Trumpington Street, Cambridge CB2 1PZ, UK e-mail:","Department of Engineering, Cambridge University, Trumpington street, Cambridge CB2 1PZ, UK"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Engineering, Cambridge University, Trumpington Street, Cambridge CB2 1PZ, UK e-mail:","institution_ids":["https://openalex.org/I241749"]},{"raw_affiliation_string":"Department of Engineering, Cambridge University, Trumpington street, Cambridge CB2 1PZ, UK","institution_ids":["https://openalex.org/I241749"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5084723540"],"corresponding_institution_ids":["https://openalex.org/I145311948"],"apc_list":null,"apc_paid":null,"fwci":5.5924,"has_fulltext":false,"cited_by_count":44,"citation_normalized_percentile":{"value":0.95503385,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":"13","issue":"3","first_page":"235","last_page":"260"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9883999824523926,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.9149922132492065},{"id":"https://openalex.org/keywords/machine-translation","display_name":"Machine translation","score":0.7671444416046143},{"id":"https://openalex.org/keywords/translation","display_name":"Translation (biology)","score":0.6311559677124023},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.557527482509613},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.5277795791625977},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.511809766292572},{"id":"https://openalex.org/keywords/sentence","display_name":"Sentence","score":0.5049973130226135},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.4991168975830078},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.4705241918563843},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.4525827169418335},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.4335963726043701},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.12702205777168274}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.9149922132492065},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.7671444416046143},{"id":"https://openalex.org/C149364088","wikidata":"https://www.wikidata.org/wiki/Q185917","display_name":"Translation (biology)","level":4,"score":0.6311559677124023},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.557527482509613},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.5277795791625977},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.511809766292572},{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.5049973130226135},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.4991168975830078},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.4705241918563843},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.4525827169418335},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.4335963726043701},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.12702205777168274},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C105580179","wikidata":"https://www.wikidata.org/wiki/Q188928","display_name":"Messenger RNA","level":3,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1017/s1351324906004293","is_oa":false,"landing_page_url":"https://doi.org/10.1017/s1351324906004293","pdf_url":null,"source":{"id":"https://openalex.org/S18088403","display_name":"Natural Language Engineering","issn_l":"1351-3249","issn":["1351-3249","1469-8110"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310311721","host_organization_name":"Cambridge University Press","host_organization_lineage":["https://openalex.org/P4310311721","https://openalex.org/P4310311702"],"host_organization_lineage_names":["Cambridge University Press","University of Cambridge"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Natural Language Engineering","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.75,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":15,"referenced_works":["https://openalex.org/W1510951099","https://openalex.org/W1819903106","https://openalex.org/W2038698865","https://openalex.org/W2091889711","https://openalex.org/W2101105183","https://openalex.org/W2103237065","https://openalex.org/W2119168550","https://openalex.org/W2120667384","https://openalex.org/W2134383396","https://openalex.org/W2138753018","https://openalex.org/W2156985047","https://openalex.org/W2352162662","https://openalex.org/W2626190081","https://openalex.org/W2949984266","https://openalex.org/W3085162807"],"related_works":["https://openalex.org/W2380075625","https://openalex.org/W3011059803","https://openalex.org/W2375873920","https://openalex.org/W3151736118","https://openalex.org/W2146114872","https://openalex.org/W2392060890","https://openalex.org/W4362495644","https://openalex.org/W4390549206","https://openalex.org/W2392760275","https://openalex.org/W2883671469"],"abstract_inverted_index":{"We":[0,20,135,165],"address":[1],"the":[2,22,43,67,86,98,101,115,126,155],"problem":[3,23],"of":[4,26,66,100,120,128,139,154],"extracting":[5],"bilingual":[6],"chunk":[7,116,121,169],"pairs":[8,122],"from":[9,133],"parallel":[10,68,77,102,156],"text":[11,32,78,103,157],"to":[12,60,84,110,113,162],"create":[13],"training":[14],"sets":[15],"for":[16],"statistical":[17],"machine":[18,129],"translation.":[19],"formulate":[21],"in":[24,94,175],"terms":[25],"a":[27,51,74,172],"stochastic":[28],"generative":[29],"process":[30],"over":[31],"translation":[33,130],"pairs,":[34],"and":[35],"derive":[36],"two":[37],"different":[38],"alignment":[39,45,55,65,79,170,177,182],"procedures":[40],"based":[41],"on":[42],"underlying":[44],"model.":[46],"The":[47,70,118],"first":[48,173],"procedure":[49,72,80,91],"is":[50,73,92],"now-standard":[52],"dynamic":[53],"programming":[54],"model":[56],"which":[57,81,107],"we":[58,82],"use":[59,83],"generate":[61],"an":[62],"initial":[63],"coarse":[64],"text.":[69],"second":[71],"divisive":[75,140],"clustering":[76,141],"refine":[85],"first-pass":[87],"alignments.":[88],"This":[89],"latter":[90],"novel":[93],"that":[95,158,168],"it":[96],"permits":[97],"segmentation":[99],"into":[104],"sub-sentence":[105],"units":[106],"are":[108,123],"allowed":[109],"be":[111,149,163],"reordered":[112],"improve":[114],"alignment.":[117],"quality":[119],"measured":[124],"by":[125,151],"performance":[127,147],"systems":[131],"trained":[132],"them.":[134],"show":[136,167],"practical":[137],"benefits":[138],"as":[142,144,171],"well":[143],"how":[145],"system":[146],"can":[148,178],"improved":[150],"exploiting":[152],"portions":[153],"otherwise":[159],"would":[160],"have":[161],"discarded.":[164],"also":[166],"step":[174],"word":[176,181],"significantly":[179],"reduce":[180],"error":[183],"rate.":[184]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":4},{"year":2019,"cited_by_count":1},{"year":2018,"cited_by_count":1},{"year":2016,"cited_by_count":3},{"year":2015,"cited_by_count":7},{"year":2014,"cited_by_count":2},{"year":2013,"cited_by_count":3},{"year":2012,"cited_by_count":2}],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2025-10-10T00:00:00"}
