{"id":"https://openalex.org/W7115561181","doi":"https://doi.org/10.48550/arxiv.2512.11074","title":"MultiScript30k: Leveraging Multilingual Embeddings to Extend Cross Script Parallel Data","display_name":"MultiScript30k: Leveraging Multilingual Embeddings to Extend Cross Script Parallel Data","publication_year":2025,"publication_date":"2025-12-11","ids":{"openalex":"https://openalex.org/W7115561181","doi":"https://doi.org/10.48550/arxiv.2512.11074"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2512.11074","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2512.11074","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2512.11074","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Driggers-Ellis, Christopher","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Driggers-Ellis, Christopher","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Brinkley, Detravious","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Brinkley, Detravious","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Chen, Ray","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Ray","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Dhawan, Aashish","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dhawan, Aashish","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Wang, Daisy Zhe","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Daisy Zhe","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Grant, Christan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Grant, Christan","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9643999934196472,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9643999934196472,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.011699999682605267,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.0020000000949949026,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scripting-language","display_name":"Scripting language","score":0.6836000084877014},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.6001999974250793},{"id":"https://openalex.org/keywords/cosine-similarity","display_name":"Cosine similarity","score":0.59170001745224},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5687000155448914},{"id":"https://openalex.org/keywords/parallel-corpora","display_name":"Parallel corpora","score":0.5067999958992004},{"id":"https://openalex.org/keywords/machine-translation","display_name":"Machine translation","score":0.49729999899864197},{"id":"https://openalex.org/keywords/divergence","display_name":"Divergence (linguistics)","score":0.44110000133514404}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7541999816894531},{"id":"https://openalex.org/C61423126","wikidata":"https://www.wikidata.org/wiki/Q187432","display_name":"Scripting language","level":2,"score":0.6836000084877014},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.6710000038146973},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6140999794006348},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.6001999974250793},{"id":"https://openalex.org/C2780762811","wikidata":"https://www.wikidata.org/wiki/Q1784941","display_name":"Cosine similarity","level":3,"score":0.59170001745224},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5687000155448914},{"id":"https://openalex.org/C2985367798","wikidata":"https://www.wikidata.org/wiki/Q1346592","display_name":"Parallel corpora","level":3,"score":0.5067999958992004},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.49729999899864197},{"id":"https://openalex.org/C207390915","wikidata":"https://www.wikidata.org/wiki/Q1230525","display_name":"Divergence (linguistics)","level":2,"score":0.44110000133514404},{"id":"https://openalex.org/C2778029271","wikidata":"https://www.wikidata.org/wiki/Q5421931","display_name":"Extension (predicate logic)","level":2,"score":0.4350000023841858},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.37049999833106995},{"id":"https://openalex.org/C149364088","wikidata":"https://www.wikidata.org/wiki/Q185917","display_name":"Translation (biology)","level":4,"score":0.3686000108718872},{"id":"https://openalex.org/C2778334786","wikidata":"https://www.wikidata.org/wiki/Q1586270","display_name":"Variation (astronomy)","level":2,"score":0.3644999861717224},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.3158000111579895},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.3003000020980835},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.26510000228881836},{"id":"https://openalex.org/C155092808","wikidata":"https://www.wikidata.org/wiki/Q182557","display_name":"Computational linguistics","level":2,"score":0.2522999942302704}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2512.11074","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2512.11074","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2512.11074","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2512.11074","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.8006231784820557,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Multi30k":[0,62,75,101,117,147,175],"is":[1,24,88,170],"frequently":[2],"cited":[3],"in":[4,68,107,134,192],"the":[5,60,78,113,173,195],"multimodal":[6],"machine":[7],"translation":[8,189],"(MMT)":[9],"literature,":[10],"offering":[11],"parallel":[12],"text":[13],"data":[14],"for":[15,104,163],"training":[16],"and":[17,32,86,128,141,156,178],"fine-tuning":[18],"deep":[19],"learning":[20],"models.":[21],"However,":[22],"it":[23],"limited":[25],"to":[26,40,73,172,194],"four":[27],"languages:":[28],"Czech,":[29],"English,":[30],"French,":[31],"German.":[33],"This":[34],"restriction":[35],"has":[36,56],"led":[37],"many":[38],"researchers":[39],"focus":[41],"their":[42],"investigations":[43],"only":[44,64],"on":[45,53],"these":[46,94],"languages.":[47],"As":[48],"a":[49,99,188],"result,":[50],"MMT":[51],"research":[52],"diverse":[54],"languages":[55,67,106,165],"been":[57],"stalled":[58],"because":[59],"official":[61],"dataset":[63,102,122],"represents":[65],"European":[66],"Latin":[69],"scripts.":[70],"Previous":[71],"efforts":[72],"extend":[74],"exist,":[76],"but":[77,203],"list":[79],"of":[80,116,124,131,185,190],"supported":[81,166],"languages,":[82],"represented":[83],"language":[84],"families,":[85],"scripts":[87],"still":[89],"very":[90],"short.":[91],"To":[92],"address":[93],"issues,":[95],"we":[96],"propose":[97],"MultiScript30k,":[98],"new":[100],"extension":[103,148],"global":[105],"various":[108],"scripts,":[109],"created":[110],"by":[111],"translating":[112],"English":[114],"version":[115],"(Multi30k-En)":[118],"using":[119],"NLLB200-3.3B.":[120],"The":[121],"consists":[123],"over":[125],"\\(30000\\)":[126],"sentences":[127,133],"provides":[129],"translations":[130],"all":[132,164],"Multi30k-En":[135,191],"into":[136],"Ar,":[137],"Es,":[138],"Uk,":[139],"Zh\\_Hans":[140],"Zh\\_Hant.":[142],"Similarity":[143],"analysis":[144],"shows":[145],"that":[146],"consistently":[149],"achieves":[150],"greater":[151,207],"than":[152,161,208],"\\(0.8\\)":[153],"cosine":[154],"similarity":[155],"symmetric":[157],"KL":[158],"divergence":[159],"less":[160],"\\(0.000251\\)":[162],"except":[167],"Zh\\_Hant":[168],"which":[169],"comparable":[171],"previous":[174],"extensions":[176],"ArEnMulti30k":[177,198],"Multi30k-Uk.":[179],"COMETKiwi":[180],"scores":[181,199,205],"reveal":[182],"mixed":[183],"assessments":[184],"MultiScript30k":[186],"as":[187],"comparison":[193],"related":[196],"work.":[197],"nearly":[200],"equal":[201],"MultiScript30k-Ar,":[202],"Multi30k-Uk":[204],"$6.4\\%$":[206],"MultiScript30k-Uk":[209],"per":[210],"split.":[211]},"counts_by_year":[],"updated_date":"2025-12-16T23:48:00.217561","created_date":"2025-12-16T00:00:00"}
