{"id":"https://openalex.org/W7117130829","doi":"https://doi.org/10.48550/arxiv.2512.18608","title":"A Comparative Study of Light-weight Language Models for PII Masking and their Deployment for Real Conversational Texts","display_name":"A Comparative Study of Light-weight Language Models for PII Masking and their Deployment for Real Conversational Texts","publication_year":2025,"publication_date":"2025-12-21","ids":{"openalex":"https://openalex.org/W7117130829","doi":"https://doi.org/10.48550/arxiv.2512.18608"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2512.18608","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2512.18608","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2512.18608","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5121131232","display_name":"Prabigya Acharya","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Acharya, Prabigya","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5121163907","display_name":"Liza Shrestha","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shrestha, Liza","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5121131232"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.24050000309944153,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.24050000309944153,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11045","display_name":"Privacy, Security, and Data Protection","score":0.12540000677108765,"subfield":{"id":"https://openalex.org/subfields/3312","display_name":"Sociology and Political Science"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11704","display_name":"Mobile Crowdsensing and Crowdsourcing","score":0.08110000193119049,"subfield":{"id":"https://openalex.org/subfields/1706","display_name":"Computer Science Applications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/masking","display_name":"Masking (illustration)","score":0.6707000136375427},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.6284000277519226},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6281999945640564},{"id":"https://openalex.org/keywords/offset","display_name":"Offset (computer science)","score":0.6234999895095825},{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.5200999975204468},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.5149999856948853},{"id":"https://openalex.org/keywords/normalization","display_name":"Normalization (sociology)","score":0.510699987411499},{"id":"https://openalex.org/keywords/standardization","display_name":"Standardization","score":0.4528999924659729}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7402999997138977},{"id":"https://openalex.org/C2777402240","wikidata":"https://www.wikidata.org/wiki/Q6783436","display_name":"Masking (illustration)","level":2,"score":0.6707000136375427},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.6284000277519226},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6281999945640564},{"id":"https://openalex.org/C175291020","wikidata":"https://www.wikidata.org/wiki/Q1156822","display_name":"Offset (computer science)","level":2,"score":0.6234999895095825},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5652999877929688},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.5200999975204468},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.5149999856948853},{"id":"https://openalex.org/C136886441","wikidata":"https://www.wikidata.org/wiki/Q926129","display_name":"Normalization (sociology)","level":2,"score":0.510699987411499},{"id":"https://openalex.org/C188087704","wikidata":"https://www.wikidata.org/wiki/Q369577","display_name":"Standardization","level":2,"score":0.4528999924659729},{"id":"https://openalex.org/C100660578","wikidata":"https://www.wikidata.org/wiki/Q18733","display_name":"Recall","level":2,"score":0.4514000117778778},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4415999948978424},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.41499999165534973},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.3440000116825104},{"id":"https://openalex.org/C97256817","wikidata":"https://www.wikidata.org/wiki/Q1462316","display_name":"Spurious relationship","level":2,"score":0.3075999915599823},{"id":"https://openalex.org/C2778571376","wikidata":"https://www.wikidata.org/wiki/Q1355821","display_name":"Frontier","level":2,"score":0.3012999892234802},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.29660001397132874},{"id":"https://openalex.org/C2983448237","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Language understanding","level":2,"score":0.2888000011444092},{"id":"https://openalex.org/C155092808","wikidata":"https://www.wikidata.org/wiki/Q182557","display_name":"Computational linguistics","level":2,"score":0.27630001306533813},{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.2754000127315521},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.27079999446868896},{"id":"https://openalex.org/C2987487971","wikidata":"https://www.wikidata.org/wiki/Q8096","display_name":"Lexical access","level":3,"score":0.26589998602867126},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.2533000111579895}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2512.18608","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2512.18608","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2512.18608","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2512.18608","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Automated":[0],"masking":[1,22,104,187],"of":[2,33],"Personally":[3],"Identifiable":[4],"Information":[5],"(PII)":[6],"is":[7],"critical":[8],"for":[9,102,155],"privacy-preserving":[10],"conversational":[11,136],"systems.":[12],"While":[13],"current":[14],"frontier":[15,100,195],"large":[16],"language":[17],"models":[18,36,95,182],"demonstrate":[19],"strong":[20],"PII":[21,70,75,103,123,157,186],"capabilities,":[23],"concerns":[24,192],"about":[25],"data":[26,190],"handling":[27,191],"and":[28,44,50,69,77,83,88,117,143,176],"computational":[29,177],"costs":[30],"motivate":[31],"exploration":[32],"whether":[34],"lightweight":[35,94,181],"can":[37,183],"achieve":[38,96],"comparable":[39,98],"performance.":[40],"We":[41,60],"compare":[42],"encoder-decoder":[43],"decoder-only":[45],"architectures":[46],"by":[47],"fine-tuning":[48],"T5-small":[49],"Mistral-Instruct-v0.3":[51],"on":[52,160],"English":[53],"datasets":[54],"constructed":[55],"from":[56],"the":[57],"AI4Privacy":[58],"benchmark.":[59],"create":[61],"different":[62],"dataset":[63],"variants":[64],"to":[65,99],"study":[66],"label":[67],"standardization":[68],"representation,":[71],"covering":[72],"24":[73],"standardized":[74],"categories":[76],"higher-granularity":[78],"settings.":[79],"Evaluation":[80,159],"using":[81],"entity-level":[82],"character-level":[84],"metrics,":[85],"type":[86],"accuracy,":[87,174],"exact":[89],"match":[90],"shows":[91],"that":[92,180],"both":[93],"performance":[97,110,164],"LLMs":[101],"tasks.":[105],"Label":[106],"normalization":[107],"consistently":[108],"improves":[109],"across":[111,122],"architectures.":[112],"Mistral":[113],"achieves":[114],"higher":[115,128],"F1":[116],"recall":[118],"with":[119,194],"greater":[120],"robustness":[121],"types":[124],"but":[125],"incurs":[126],"significantly":[127],"generation":[129],"latency.":[130],"T5,":[131],"while":[132,188],"less":[133],"robust":[134],"in":[135,150],"text,":[137],"offers":[138],"more":[139],"controllable":[140],"structured":[141],"outputs":[142],"lower":[144],"inference":[145],"cost,":[146],"motivating":[147],"its":[148],"use":[149],"a":[151],"real-time":[152],"Discord":[153],"bot":[154],"real-world":[156],"redaction.":[158],"live":[161],"messages":[162],"reveals":[163],"degradation":[165],"under":[166],"informal":[167],"inputs.":[168],"These":[169],"results":[170],"clarify":[171],"trade-offs":[172],"between":[173],"robustness,":[175],"efficiency,":[178],"demonstrating":[179],"provide":[184],"effective":[185],"addressing":[189],"associated":[193],"LLMs.":[196]},"counts_by_year":[],"updated_date":"2025-12-24T23:14:05.333182","created_date":"2025-12-24T00:00:00"}
