{"id":"https://openalex.org/W7129079422","doi":"https://doi.org/10.48550/arxiv.2602.12937","title":"Curriculum Learning and Pseudo-Labeling Improve the Generalization of Multi-Label Arabic Dialect Identification Models","display_name":"Curriculum Learning and Pseudo-Labeling Improve the Generalization of Multi-Label Arabic Dialect Identification Models","publication_year":2026,"publication_date":"2026-02-12","ids":{"openalex":"https://openalex.org/W7129079422","doi":"https://doi.org/10.48550/arxiv.2602.12937"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2602.12937","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.12937","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2602.12937","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126132710","display_name":"Ali Mekky","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Mekky, Ali","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5118628339","display_name":"Mohamed El Zeftawy","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zeftawy, Mohamed El","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126167333","display_name":"Lara Hassan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hassan, Lara","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126116954","display_name":"Amr Keleg","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Keleg, Amr","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5126082577","display_name":"Preslav Nakov","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nakov, Preslav","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5126132710"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12380","display_name":"Authorship Attribution and Profiling","score":0.32850000262260437,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12380","display_name":"Authorship Attribution and Profiling","score":0.32850000262260437,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11550","display_name":"Text and Document Classification Technologies","score":0.2379000037908554,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.08760000020265579,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/classifier","display_name":"Classifier (UML)","score":0.635699987411499},{"id":"https://openalex.org/keywords/identification","display_name":"Identification (biology)","score":0.5634999871253967},{"id":"https://openalex.org/keywords/arabic","display_name":"Arabic","score":0.5485000014305115},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.43860000371932983},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.3887999951839447},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.36910000443458557},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.35690000653266907},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.3431999981403351}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7103999853134155},{"id":"https://openalex.org/C95623464","wikidata":"https://www.wikidata.org/wiki/Q1096149","display_name":"Classifier (UML)","level":2,"score":0.635699987411499},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.633899986743927},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6261000037193298},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.5634999871253967},{"id":"https://openalex.org/C96455323","wikidata":"https://www.wikidata.org/wiki/Q13955","display_name":"Arabic","level":2,"score":0.5485000014305115},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.43860000371932983},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.3887999951839447},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.36910000443458557},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.36329999566078186},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.35690000653266907},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.3431999981403351},{"id":"https://openalex.org/C66905080","wikidata":"https://www.wikidata.org/wiki/Q17005494","display_name":"Binary classification","level":3,"score":0.3343000113964081},{"id":"https://openalex.org/C2778243841","wikidata":"https://www.wikidata.org/wiki/Q56467","display_name":"Modern Standard Arabic","level":3,"score":0.33410000801086426},{"id":"https://openalex.org/C136389625","wikidata":"https://www.wikidata.org/wiki/Q334384","display_name":"Supervised learning","level":3,"score":0.3301999866962433},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.3100000023841858},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2867000102996826},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.28130000829696655},{"id":"https://openalex.org/C166955791","wikidata":"https://www.wikidata.org/wiki/Q629579","display_name":"Macro","level":2,"score":0.2802000045776367},{"id":"https://openalex.org/C519536355","wikidata":"https://www.wikidata.org/wiki/Q21021151","display_name":"Repurposing","level":2,"score":0.27149999141693115},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.26429998874664307},{"id":"https://openalex.org/C48372109","wikidata":"https://www.wikidata.org/wiki/Q3913","display_name":"Binary number","level":2,"score":0.25679999589920044}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2602.12937","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.12937","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2602.12937","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.12937","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.8664416074752808,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Being":[0],"modeled":[1],"as":[2,23,77,81],"a":[3,8,24,95,123,147],"single-label":[4,36,51],"classification":[5,26],"task":[6],"for":[7,44,64,155],"long":[9],"time,":[10],"recent":[11],"work":[12],"has":[13],"argued":[14],"that":[15,56],"Arabic":[16,66,115],"Dialect":[17,67],"Identification":[18,68],"(ADI)":[19],"should":[20],"be":[21,84],"framed":[22],"multi-label":[25,41,96,101,125],"task.":[27],"However,":[28],"ADI":[29,52],"remains":[30],"constrained":[31],"by":[32,98,113],"the":[33,57,72,114,139,156],"availability":[34],"of":[35,74,117,150],"datasets,":[37],"with":[38,110,132],"no":[39],"large-scale":[40],"resources":[42],"available":[43,165],"training.":[45],"By":[46],"analyzing":[47],"models":[48],"trained":[49],"on":[50],"data,":[53],"we":[54,93,121],"show":[55],"main":[58],"difficulty":[59],"in":[60,71,86],"repurposing":[61],"such":[62],"datasets":[63],"Multi-Label":[65],"(MLADI)":[69],"lies":[70],"selection":[73],"negative":[75,82],"samples,":[76],"many":[78],"sentences":[79],"treated":[80],"could":[83],"acceptable":[85],"multiple":[87],"dialects.":[88],"To":[89],"address":[90],"these":[91],"issues,":[92],"construct":[94],"dataset":[97],"generating":[99],"automatic":[100],"annotations":[102],"using":[103,127],"GPT-4o":[104],"and":[105,135,162],"binary":[106],"dialect":[107],"acceptability":[108],"classifiers,":[109],"aggregation":[111],"guided":[112],"Level":[116],"Dialectness":[118],"(ALDi).":[119],"Afterward,":[120],"train":[122],"BERT-based":[124],"classifier":[126],"curriculum":[128],"learning":[129],"strategies":[130],"aligned":[131],"dialectal":[133],"complexity":[134],"label":[136],"cardinality.":[137],"On":[138],"MLADI":[140],"leaderboard,":[141],"our":[142],"best-performing":[143],"LAHJATBERT":[144],"model":[145],"achieves":[146],"macro":[148],"F1":[149],"0.69,":[151],"compared":[152],"to":[153],"0.55":[154],"strongest":[157],"previously":[158],"reported":[159],"system.":[160],"Code":[161],"data":[163],"are":[164],"at":[166],"https://mohamedalaa9.github.io/lahjatbert/.":[167]},"counts_by_year":[],"updated_date":"2026-02-17T06:10:05.244387","created_date":"2026-02-17T00:00:00"}
