{"id":"https://openalex.org/W4417271840","doi":"https://doi.org/10.1145/3783981","title":"How Much Data in Low-resource Indian Languages is \"Sufficient' for Transfer Learning: A Comparative Study for POS Annotation","display_name":"How Much Data in Low-resource Indian Languages is \"Sufficient' for Transfer Learning: A Comparative Study for POS Annotation","publication_year":2025,"publication_date":"2025-12-12","ids":{"openalex":"https://openalex.org/W4417271840","doi":"https://doi.org/10.1145/3783981"},"language":"en","primary_location":{"id":"doi:10.1145/3783981","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3783981","pdf_url":null,"source":{"id":"https://openalex.org/S4306421405","display_name":"ACM Transactions on Asian and Low-Resource Language Information Processing","issn_l":"2375-4699","issn":["2375-4699","2375-4702"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Asian and Low-Resource Language Information Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5035050319","display_name":"Mohit Raj","orcid":"https://orcid.org/0000-0002-2080-9910"},"institutions":[{"id":"https://openalex.org/I147727580","display_name":"Dr. Bhim Rao Ambedkar University","ror":"https://ror.org/00k4ab982","country_code":"IN","type":"education","lineage":["https://openalex.org/I147727580"]},{"id":"https://openalex.org/I190765188","display_name":"Ambedkar University Delhi","ror":"https://ror.org/0039wpa60","country_code":"IN","type":"education","lineage":["https://openalex.org/I190765188"]}],"countries":["IN"],"is_corresponding":true,"raw_author_name":"Mohit Raj","raw_affiliation_strings":["Linguistics, Dr Bhim Rao Ambedkar University","Linguistics, Dr Bhim Rao Ambedkar University, Agra, India"],"raw_orcid":"https://orcid.org/0000-0002-2080-9910","affiliations":[{"raw_affiliation_string":"Linguistics, Dr Bhim Rao Ambedkar University","institution_ids":["https://openalex.org/I190765188"]},{"raw_affiliation_string":"Linguistics, Dr Bhim Rao Ambedkar University, Agra, India","institution_ids":["https://openalex.org/I147727580"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5014581579","display_name":"Ritesh Kumar","orcid":"https://orcid.org/0000-0002-5151-2546"},"institutions":[{"id":"https://openalex.org/I147727580","display_name":"Dr. Bhim Rao Ambedkar University","ror":"https://ror.org/00k4ab982","country_code":"IN","type":"education","lineage":["https://openalex.org/I147727580"]},{"id":"https://openalex.org/I4210111237","display_name":"BioRealm (United States)","ror":"https://ror.org/01rqg5073","country_code":"US","type":"company","lineage":["https://openalex.org/I4210111237"]},{"id":"https://openalex.org/I4210156156","display_name":"Office of Diversity and Inclusion","ror":"https://ror.org/04s3dsz85","country_code":"US","type":"government","lineage":["https://openalex.org/I1322918889","https://openalex.org/I4210156156"]}],"countries":["IN","US"],"is_corresponding":false,"raw_author_name":"Ritesh Kumar","raw_affiliation_strings":["Council for Diversity and Innovation","UnReaL-TecE LLP","Council for Diversity and Innovation, Agra, India","UnReaL-TecE LLP, Agra, India"],"raw_orcid":"https://orcid.org/0000-0002-5151-2546","affiliations":[{"raw_affiliation_string":"Council for Diversity and Innovation","institution_ids":["https://openalex.org/I4210156156"]},{"raw_affiliation_string":"UnReaL-TecE LLP","institution_ids":["https://openalex.org/I4210111237"]},{"raw_affiliation_string":"Council for Diversity and Innovation, Agra, India","institution_ids":["https://openalex.org/I4210156156"]},{"raw_affiliation_string":"UnReaL-TecE LLP, Agra, India","institution_ids":["https://openalex.org/I147727580"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5035050319"],"corresponding_institution_ids":["https://openalex.org/I147727580","https://openalex.org/I190765188"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.5076324,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"25","issue":"1","first_page":"1","last_page":"26"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13194","display_name":"ICT in Developing Communities","score":0.259799987077713,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13194","display_name":"ICT in Developing Communities","score":0.259799987077713,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.12520000338554382,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.10130000114440918,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/transfer-of-learning","display_name":"Transfer of learning","score":0.6463000178337097},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5583000183105469},{"id":"https://openalex.org/keywords/hindi","display_name":"Hindi","score":0.47679999470710754},{"id":"https://openalex.org/keywords/annotation","display_name":"Annotation","score":0.4542999863624573},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.37229999899864197},{"id":"https://openalex.org/keywords/transfer","display_name":"Transfer (computing)","score":0.37229999899864197},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.34360000491142273},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.33559998869895935},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.3294000029563904}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8407999873161316},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6769999861717224},{"id":"https://openalex.org/C150899416","wikidata":"https://www.wikidata.org/wiki/Q1820378","display_name":"Transfer of learning","level":2,"score":0.6463000178337097},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.6406000256538391},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5583000183105469},{"id":"https://openalex.org/C519982507","wikidata":"https://www.wikidata.org/wiki/Q1568","display_name":"Hindi","level":2,"score":0.47679999470710754},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.4542999863624573},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.37229999899864197},{"id":"https://openalex.org/C2776175482","wikidata":"https://www.wikidata.org/wiki/Q1195816","display_name":"Transfer (computing)","level":2,"score":0.37229999899864197},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.34360000491142273},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3433000147342682},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.33559998869895935},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.3294000029563904},{"id":"https://openalex.org/C129792486","wikidata":"https://www.wikidata.org/wiki/Q1050419","display_name":"Language identification","level":3,"score":0.3174999952316284},{"id":"https://openalex.org/C2777938197","wikidata":"https://www.wikidata.org/wiki/Q7834022","display_name":"Transfer of training","level":2,"score":0.30649998784065247},{"id":"https://openalex.org/C2983448237","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Language understanding","level":2,"score":0.3012999892234802},{"id":"https://openalex.org/C127162648","wikidata":"https://www.wikidata.org/wiki/Q16858953","display_name":"Channel (broadcasting)","level":2,"score":0.29670000076293945},{"id":"https://openalex.org/C94922259","wikidata":"https://www.wikidata.org/wiki/Q33215","display_name":"Constructed language","level":2,"score":0.29600000381469727},{"id":"https://openalex.org/C74672266","wikidata":"https://www.wikidata.org/wiki/Q815859","display_name":"Language acquisition","level":2,"score":0.28619998693466187},{"id":"https://openalex.org/C155092808","wikidata":"https://www.wikidata.org/wiki/Q182557","display_name":"Computational linguistics","level":2,"score":0.273499995470047},{"id":"https://openalex.org/C28006648","wikidata":"https://www.wikidata.org/wiki/Q6934509","display_name":"Multi-task learning","level":3,"score":0.26980000734329224},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.26440000534057617},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.2581999897956848},{"id":"https://openalex.org/C77075516","wikidata":"https://www.wikidata.org/wiki/Q6027324","display_name":"Inductive transfer","level":5,"score":0.2572999894618988},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.25130000710487366}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3783981","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3783981","pdf_url":null,"source":{"id":"https://openalex.org/S4306421405","display_name":"ACM Transactions on Asian and Low-Resource Language Information Processing","issn_l":"2375-4699","issn":["2375-4699","2375-4702"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Asian and Low-Resource Language Information Processing","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":9,"referenced_works":["https://openalex.org/W2039574266","https://openalex.org/W2098830640","https://openalex.org/W2250389961","https://openalex.org/W2963088995","https://openalex.org/W2963338481","https://openalex.org/W3035032094","https://openalex.org/W3035390927","https://openalex.org/W3194993673","https://openalex.org/W4280557709"],"related_works":[],"abstract_inverted_index":{"Recent":[0],"advances":[1],"in":[2,19,35,59,162,176,215,225,245],"machine":[3],"learning":[4,7,17,30,125],"and":[5,12,21,37,77,105,109,139,145,183,209],"deep":[6],"have":[8],"demonstrated":[9],"the":[10,26,45,53,63,84,90,97,106,110,136,140,151,163,230,239,262,265,269,272],"applicability":[11,27],"utility":[13],"of":[14,28,47,55,80,112,217,232,251,264,268],"cross-lingual,":[15],"transfer":[16,29,124,277],"methods":[18,31],"low":[20],"zero-resource":[22],"scenarios.":[23],"We":[24,43],"explore":[25],"from":[32],"pre-trained":[33,203],"models":[34],"zero-shot":[36],"few-shot":[38],"scenarios":[39],"for":[40,71,95,103,126,172,189,220,275],"part-of-speech":[41,181],"tagging.":[42],"report":[44],"results":[46],"an":[48,222],"ablation":[49],"study":[50,85,130,152],"to":[51,122,249],"understand":[52],"impact":[54,263],"training":[56],"data":[57,93,161,196],"size":[58],"low-resource":[60,72,107,127,148,164,190,235],"languages":[61,73,243,270],"on":[62,255,271],"system\u2019s":[64],"performance.":[65],"Since":[66],"building":[67],"or":[68],"augmenting":[69],"datasets":[70],"is":[74,120,131,170,184,261],"tricky,":[75],"costly":[76],"a":[78,177,246,252,256],"lot":[79],"time":[81],"not":[82],"feasible,":[83],"provides":[86],"valuable":[87],"insights":[88],"into":[89],"expected":[91],"relative":[92],"requirements":[94,219],"both":[96],"high-resource":[98,137,168],"language":[99,102,108,138,169,191],"(the":[100],"source":[101],"transfer)":[104],"kind":[111],"performance":[113,175,224,247],"boost":[114],"one":[115,119],"could":[116],"expect":[117],"when":[118],"planning":[121],"use":[123,194],"languages.":[128,149],"The":[129],"conducted":[132],"with":[133],"Hindi":[134],"as":[135,165,167],"three":[141],"related":[142],"languages\u2014Magahi,":[143],"Bhojpuri,":[144],"Braj\u2014as":[146],"extremely":[147],"Overall,":[150],"addresses":[153],"four":[154],"broad":[155],"research":[156],"questions:":[157],"(a)":[158],"How":[159],"much":[160],"well":[166],"\u201csufficient\u201d":[171],"attaining":[173,221],"optimum":[174,223],"downstream":[178],"task":[179],"like":[180],"annotation,":[182],"there":[185],"any":[186,212],"specific":[187],"advantage":[188,214],"if":[192],"we":[193],"multilingual":[195,202],"during":[197],"fine-tuning?":[198],"(b)":[199],"Do":[200],"different":[201],"models,":[204],"specifically":[205],"multilingual-BERT,":[206],"multilingual-DistilBERT,":[207],"XLM-RoBERTa,":[208],"MuRIL,":[210],"offer":[211],"significant":[213],"terms":[216],"dataset":[218,240,273],"Indian":[226],"languages?":[227],"(c)":[228],"In":[229],"case":[231],"multiple":[233,242],"closely-related":[234],"languages,":[236],"does":[237],"distributing":[238],"across":[241],"result":[244],"comparable":[248],"that":[250],"system":[253],"trained":[254],"single":[257],"language?":[258],"(d)":[259],"What":[260],"typological":[266],"similarity":[267],"requirement":[274],"successful":[276],"learning?":[278]},"counts_by_year":[],"updated_date":"2026-01-11T23:08:45.486102","created_date":"2025-12-12T00:00:00"}
