{"id":"https://openalex.org/W2047295649","doi":"https://doi.org/10.1162/089120103322711578","title":"The Web as a Parallel Corpus","display_name":"The Web as a Parallel Corpus","publication_year":2003,"publication_date":"2003-09-01","ids":{"openalex":"https://openalex.org/W2047295649","doi":"https://doi.org/10.1162/089120103322711578","mag":"2047295649"},"language":"en","primary_location":{"id":"doi:10.1162/089120103322711578","is_oa":false,"landing_page_url":"https://doi.org/10.1162/089120103322711578","pdf_url":null,"source":{"id":"https://openalex.org/S155526855","display_name":"Computational Linguistics","issn_l":"0891-2017","issn":["0891-2017","1530-9312"],"is_oa":false,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320244","host_organization_name":"Association for Computational Linguistics","host_organization_lineage":["https://openalex.org/P4310320244"],"host_organization_lineage_names":["Association for Computational Linguistics"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computational Linguistics","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"http://hdl.handle.net/1903/1213","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5059946729","display_name":"Philip Resnik","orcid":"https://orcid.org/0000-0002-6130-8602"},"institutions":[{"id":"https://openalex.org/I66946132","display_name":"University of Maryland, College Park","ror":"https://ror.org/047s2c258","country_code":"US","type":"education","lineage":["https://openalex.org/I66946132"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Philip Resnik","raw_affiliation_strings":["University of Maryland, Department of Linguistics and Institute for Advanced Computer Studies, University of Maryland, College Park, MD 20742"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Maryland, Department of Linguistics and Institute for Advanced Computer Studies, University of Maryland, College Park, MD 20742","institution_ids":["https://openalex.org/I66946132"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5088517824","display_name":"Noah A. Smith","orcid":"https://orcid.org/0000-0002-2310-6380"},"institutions":[{"id":"https://openalex.org/I145311948","display_name":"Johns Hopkins University","ror":"https://ror.org/00za53h95","country_code":"US","type":"education","lineage":["https://openalex.org/I145311948"]},{"id":"https://openalex.org/I2799853436","display_name":"Johns Hopkins Medicine","ror":"https://ror.org/037zgn354","country_code":"US","type":"healthcare","lineage":["https://openalex.org/I2799853436"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Noah A. Smith","raw_affiliation_strings":["Johns Hopkins University, Department of Computer Science and Center for Language and Speech Processing, Johns Hopkins University, Baltimore, MD 21218"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Johns Hopkins University, Department of Computer Science and Center for Language and Speech Processing, Johns Hopkins University, Baltimore, MD 21218","institution_ids":["https://openalex.org/I145311948","https://openalex.org/I2799853436"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5059946729"],"corresponding_institution_ids":["https://openalex.org/I66946132"],"apc_list":null,"apc_paid":null,"fwci":34.198,"has_fulltext":false,"cited_by_count":525,"citation_normalized_percentile":{"value":0.99779343,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":100},"biblio":{"volume":"29","issue":"3","first_page":"349","last_page":"380"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12380","display_name":"Authorship Attribution and Profiling","score":0.9872000217437744,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9855999946594238,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8867483139038086},{"id":"https://openalex.org/keywords/the-internet","display_name":"The Internet","score":0.6416000127792358},{"id":"https://openalex.org/keywords/parallel-corpora","display_name":"Parallel corpora","score":0.6034821271896362},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5411703586578369},{"id":"https://openalex.org/keywords/equivalence","display_name":"Equivalence (formal languages)","score":0.5201799869537354},{"id":"https://openalex.org/keywords/adaptation","display_name":"Adaptation (eye)","score":0.4795241355895996},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.45515674352645874},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.43054550886154175},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.4279819428920746},{"id":"https://openalex.org/keywords/web-mining","display_name":"Web mining","score":0.41733503341674805},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.41676488518714905},{"id":"https://openalex.org/keywords/text-corpus","display_name":"Text corpus","score":0.41246476769447327},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.3686223030090332},{"id":"https://openalex.org/keywords/machine-translation","display_name":"Machine translation","score":0.3524126708507538},{"id":"https://openalex.org/keywords/web-service","display_name":"Web service","score":0.20569029450416565},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.08107250928878784}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8867483139038086},{"id":"https://openalex.org/C110875604","wikidata":"https://www.wikidata.org/wiki/Q75","display_name":"The Internet","level":2,"score":0.6416000127792358},{"id":"https://openalex.org/C2985367798","wikidata":"https://www.wikidata.org/wiki/Q1346592","display_name":"Parallel corpora","level":3,"score":0.6034821271896362},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5411703586578369},{"id":"https://openalex.org/C2780069185","wikidata":"https://www.wikidata.org/wiki/Q7977945","display_name":"Equivalence (formal languages)","level":2,"score":0.5201799869537354},{"id":"https://openalex.org/C139807058","wikidata":"https://www.wikidata.org/wiki/Q352374","display_name":"Adaptation (eye)","level":2,"score":0.4795241355895996},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.45515674352645874},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.43054550886154175},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.4279819428920746},{"id":"https://openalex.org/C197046077","wikidata":"https://www.wikidata.org/wiki/Q785337","display_name":"Web mining","level":3,"score":0.41733503341674805},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.41676488518714905},{"id":"https://openalex.org/C2474386","wikidata":"https://www.wikidata.org/wiki/Q461183","display_name":"Text corpus","level":2,"score":0.41246476769447327},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.3686223030090332},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.3524126708507538},{"id":"https://openalex.org/C35578498","wikidata":"https://www.wikidata.org/wiki/Q193424","display_name":"Web service","level":2,"score":0.20569029450416565},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.08107250928878784},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":5,"locations":[{"id":"doi:10.1162/089120103322711578","is_oa":false,"landing_page_url":"https://doi.org/10.1162/089120103322711578","pdf_url":null,"source":{"id":"https://openalex.org/S155526855","display_name":"Computational Linguistics","issn_l":"0891-2017","issn":["0891-2017","1530-9312"],"is_oa":false,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320244","host_organization_name":"Association for Computational Linguistics","host_organization_lineage":["https://openalex.org/P4310320244"],"host_organization_lineage_names":["Association for Computational Linguistics"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computational Linguistics","raw_type":"journal-article"},{"id":"pmh:oai:drum.lib.umd.edu:1903/1213","is_oa":true,"landing_page_url":"http://hdl.handle.net/1903/1213","pdf_url":"http://hdl.handle.net/1903/1213","source":{"id":"https://openalex.org/S4306402644","display_name":"Digital Repository at the University of Maryland (University of Maryland College Park)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I66946132","host_organization_name":"University of Maryland, College Park","host_organization_lineage":["https://openalex.org/I66946132"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Technical Report"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.19.3952","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.19.3952","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://www.cs.umd.edu/Library/TRs/CS-TR-4381/CS-TR-4381.pdf","raw_type":"text"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.62.6192","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.62.6192","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://acl.ldc.upenn.edu/J/J03/J03-3002.pdf","raw_type":"text"},{"id":"pmh:oai:doaj.org/article:7cb469e0ad874b1ebcd10afc9c97dc6d","is_oa":false,"landing_page_url":"https://doaj.org/article/7cb469e0ad874b1ebcd10afc9c97dc6d","pdf_url":null,"source":{"id":"https://openalex.org/S4306401280","display_name":"DOAJ (DOAJ: Directory of Open Access Journals)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Computational Linguistics, Vol 29, Iss 3 (2021)","raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:drum.lib.umd.edu:1903/1213","is_oa":true,"landing_page_url":"http://hdl.handle.net/1903/1213","pdf_url":"http://hdl.handle.net/1903/1213","source":{"id":"https://openalex.org/S4306402644","display_name":"Digital Repository at the University of Maryland (University of Maryland College Park)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I66946132","host_organization_name":"University of Maryland, College Park","host_organization_lineage":["https://openalex.org/I66946132"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Technical Report"},"sustainable_development_goals":[{"score":0.6899999976158142,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W2047295649.pdf"},"referenced_works_count":41,"referenced_works":["https://openalex.org/W157432847","https://openalex.org/W200075660","https://openalex.org/W1496351660","https://openalex.org/W1502581825","https://openalex.org/W1533946607","https://openalex.org/W1554769004","https://openalex.org/W1581740421","https://openalex.org/W1588612820","https://openalex.org/W1608419360","https://openalex.org/W1636405317","https://openalex.org/W1647729745","https://openalex.org/W1969178697","https://openalex.org/W1977545325","https://openalex.org/W1995875735","https://openalex.org/W2007709031","https://openalex.org/W2012268403","https://openalex.org/W2016630033","https://openalex.org/W2032773168","https://openalex.org/W2041404167","https://openalex.org/W2048679005","https://openalex.org/W2079442239","https://openalex.org/W2097333193","https://openalex.org/W2101210369","https://openalex.org/W2107330157","https://openalex.org/W2108997961","https://openalex.org/W2114930830","https://openalex.org/W2132957691","https://openalex.org/W2137854946","https://openalex.org/W2138753018","https://openalex.org/W2141325213","https://openalex.org/W2144810223","https://openalex.org/W2145080939","https://openalex.org/W2152565070","https://openalex.org/W2154124206","https://openalex.org/W2154384676","https://openalex.org/W2172167844","https://openalex.org/W2883783597","https://openalex.org/W3104029765","https://openalex.org/W4241850027","https://openalex.org/W4249159365","https://openalex.org/W4285719527"],"related_works":["https://openalex.org/W2786253471","https://openalex.org/W2604275745","https://openalex.org/W2986030184","https://openalex.org/W2104907655","https://openalex.org/W2985215540","https://openalex.org/W4307459710","https://openalex.org/W3155572818","https://openalex.org/W3175595715","https://openalex.org/W3152052241","https://openalex.org/W3108641831"],"abstract_inverted_index":{"Parallel":[0],"corpora":[1],"have":[2],"become":[3],"an":[4],"essential":[5],"resource":[6],"for":[7,26,87,114],"work":[8,21],"in":[9,106],"multilingual":[10],"natural":[11],"language":[12,117],"processing.":[13],"In":[14],"this":[15],"article,":[16],"we":[17],"report":[18],"on":[19,30,59,94],"our":[20],"using":[22],"the":[23,31,37,53,78,84,92,99,107],"STRAND":[24],"system":[25,79],"mining":[27,88],"parallel":[28,89,112],"text":[29,90],"World":[32],"Wide":[33],"Web,":[34],"first":[35],"reviewing":[36],"original":[38],"algorithm":[39],"and":[40,42,75],"results":[41],"then":[43],"presenting":[44],"a":[45,68,95,110,115],"set":[46],"of":[47,55,62,72,77,83,101,109],"significant":[48,111],"enhancements.":[49],"These":[50],"enhancements":[51],"include":[52],"use":[54],"supervised":[56],"learning":[57],"based":[58],"structural":[60],"features":[61],"documents":[63],"to":[64,80],"improve":[65],"classification":[66],"performance,":[67],"new":[69],"content-based":[70],"measure":[71],"translational":[73],"equivalence,":[74],"adaptation":[76],"take":[81],"advantage":[82],"Internet":[85],"Archive":[86],"from":[91],"Web":[93],"large":[96],"scale.":[97],"Finally,":[98],"value":[100],"these":[102],"techniques":[103],"is":[104],"demonstrated":[105],"construction":[108],"corpus":[113],"low-density":[116],"pair.":[118]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":6},{"year":2022,"cited_by_count":7},{"year":2021,"cited_by_count":21},{"year":2020,"cited_by_count":21},{"year":2019,"cited_by_count":21},{"year":2018,"cited_by_count":14},{"year":2017,"cited_by_count":22},{"year":2016,"cited_by_count":29},{"year":2015,"cited_by_count":21},{"year":2014,"cited_by_count":36},{"year":2013,"cited_by_count":27},{"year":2012,"cited_by_count":44}],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2025-10-10T00:00:00"}
