{"id":"https://openalex.org/W4320024106","doi":"https://doi.org/10.1109/bigdata55660.2022.10020486","title":"An Empirical Comparison of DistilBERT, Longformer and Logistic Regression for Predictive Coding","display_name":"An Empirical Comparison of DistilBERT, Longformer and Logistic Regression for Predictive Coding","publication_year":2022,"publication_date":"2022-12-17","ids":{"openalex":"https://openalex.org/W4320024106","doi":"https://doi.org/10.1109/bigdata55660.2022.10020486"},"language":"en","primary_location":{"id":"doi:10.1109/bigdata55660.2022.10020486","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/bigdata55660.2022.10020486","pdf_url":null,"source":{"id":"https://openalex.org/S4363607709","display_name":"2022 IEEE International Conference on Big Data (Big Data)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2022 IEEE International Conference on Big Data (Big Data)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5052463733","display_name":"Fusheng Wei","orcid":"https://orcid.org/0009-0000-5881-5307"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Fusheng Wei","raw_affiliation_strings":["Ankura,Data &#x0026; Technology,Washington,DC,USA"],"affiliations":[{"raw_affiliation_string":"Ankura,Data &#x0026; Technology,Washington,DC,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053350610","display_name":"Jingchao Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jingchao Yang","raw_affiliation_strings":["Ankura,Data &#x0026; Technology,Washington,DC,USA"],"affiliations":[{"raw_affiliation_string":"Ankura,Data &#x0026; Technology,Washington,DC,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101453263","display_name":"Qiang Mao","orcid":"https://orcid.org/0000-0002-9382-6527"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qiang Mao","raw_affiliation_strings":["Ankura,Data &#x0026; Technology,Washington,DC,USA"],"affiliations":[{"raw_affiliation_string":"Ankura,Data &#x0026; Technology,Washington,DC,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101414673","display_name":"Han Qin","orcid":"https://orcid.org/0000-0001-9776-5007"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han Qin","raw_affiliation_strings":["Ankura,Data &#x0026; Technology,Washington,DC,USA"],"affiliations":[{"raw_affiliation_string":"Ankura,Data &#x0026; Technology,Washington,DC,USA","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101619458","display_name":"A. D\u0105browski","orcid":"https://orcid.org/0009-0004-5689-2500"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Adam Dabrowski","raw_affiliation_strings":["Ankura,Data &#x0026; Technology,Washington,DC,USA"],"affiliations":[{"raw_affiliation_string":"Ankura,Data &#x0026; Technology,Washington,DC,USA","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5052463733"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.4185,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.60868183,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"3336","last_page":"3340"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11550","display_name":"Text and Document Classification Technologies","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11550","display_name":"Text and Document Classification Technologies","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9977999925613403,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8225749731063843},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6908560991287231},{"id":"https://openalex.org/keywords/transfer-of-learning","display_name":"Transfer of learning","score":0.6147186160087585},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.5702476501464844},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.5574293732643127},{"id":"https://openalex.org/keywords/cloud-computing","display_name":"Cloud computing","score":0.5375461578369141},{"id":"https://openalex.org/keywords/logistic-regression","display_name":"Logistic regression","score":0.4989614486694336},{"id":"https://openalex.org/keywords/regression","display_name":"Regression","score":0.49792003631591797},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.4894911050796509},{"id":"https://openalex.org/keywords/server","display_name":"Server","score":0.4876011908054352},{"id":"https://openalex.org/keywords/empirical-research","display_name":"Empirical research","score":0.4741779565811157},{"id":"https://openalex.org/keywords/predictive-coding","display_name":"Predictive coding","score":0.4596549868583679},{"id":"https://openalex.org/keywords/coding","display_name":"Coding (social sciences)","score":0.45462536811828613},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4102441668510437},{"id":"https://openalex.org/keywords/statistics","display_name":"Statistics","score":0.1017879843711853},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.09011703729629517},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.0684593915939331}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8225749731063843},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6908560991287231},{"id":"https://openalex.org/C150899416","wikidata":"https://www.wikidata.org/wiki/Q1820378","display_name":"Transfer of learning","level":2,"score":0.6147186160087585},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5702476501464844},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.5574293732643127},{"id":"https://openalex.org/C79974875","wikidata":"https://www.wikidata.org/wiki/Q483639","display_name":"Cloud computing","level":2,"score":0.5375461578369141},{"id":"https://openalex.org/C151956035","wikidata":"https://www.wikidata.org/wiki/Q1132755","display_name":"Logistic regression","level":2,"score":0.4989614486694336},{"id":"https://openalex.org/C83546350","wikidata":"https://www.wikidata.org/wiki/Q1139051","display_name":"Regression","level":2,"score":0.49792003631591797},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.4894911050796509},{"id":"https://openalex.org/C93996380","wikidata":"https://www.wikidata.org/wiki/Q44127","display_name":"Server","level":2,"score":0.4876011908054352},{"id":"https://openalex.org/C120936955","wikidata":"https://www.wikidata.org/wiki/Q2155640","display_name":"Empirical research","level":2,"score":0.4741779565811157},{"id":"https://openalex.org/C2778061373","wikidata":"https://www.wikidata.org/wiki/Q1315146","display_name":"Predictive coding","level":3,"score":0.4596549868583679},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.45462536811828613},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4102441668510437},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.1017879843711853},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.09011703729629517},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.0684593915939331},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/bigdata55660.2022.10020486","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/bigdata55660.2022.10020486","pdf_url":null,"source":{"id":"https://openalex.org/S4363607709","display_name":"2022 IEEE International Conference on Big Data (Big Data)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2022 IEEE International Conference on Big Data (Big Data)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions","score":0.49000000953674316}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":11,"referenced_works":["https://openalex.org/W2413512986","https://openalex.org/W2585689263","https://openalex.org/W2896457183","https://openalex.org/W2912267070","https://openalex.org/W2978017171","https://openalex.org/W2995674643","https://openalex.org/W3015468748","https://openalex.org/W3138482863","https://openalex.org/W4205757056","https://openalex.org/W6649899227","https://openalex.org/W6755207826"],"related_works":["https://openalex.org/W3131673289","https://openalex.org/W4312200629","https://openalex.org/W3133293092","https://openalex.org/W4213299466","https://openalex.org/W4318957922","https://openalex.org/W3031818154","https://openalex.org/W2960456850","https://openalex.org/W4318834068","https://openalex.org/W4281382123","https://openalex.org/W4312685930"],"abstract_inverted_index":{"In":[0,73],"recent":[1],"years,":[2],"transformer-based,":[3],"large-scale":[4],"language":[5],"models":[6,16,40],"have":[7],"greatly":[8],"advanced":[9],"deep":[10,86],"learning":[11,19,87],"in":[12,25],"NLP":[13,24],"tasks.":[14],"These":[15],"allow":[17],"transfer":[18],"to":[20,28,114,131],"be":[21],"performed":[22],"on":[23,60],"similar":[26],"methods":[27,88],"what":[29],"was":[30],"previously":[31],"done":[32],"for":[33,95],"computer":[34],"vision.":[35],"Two":[36],"of":[37,52,68,80,83,135],"the":[38,46,64,81,84,121,133],"latest":[39],"are":[41],"DistilBERT":[42],"and":[43,58,63],"Longformer":[44,110,136],"--":[45],"former":[47],"is":[48,127,144],"a":[49,91,138],"distilled":[50],"version":[51],"BERT,":[53],"which":[54],"enables":[55],"faster":[56],"training":[57],"inferencing":[59],"off-cloud":[61],"servers,":[62],"latter":[65],"offers":[66],"capability":[67],"working":[69],"with":[70,90,120],"long":[71],"texts.":[72],"this":[74],"paper,":[75],"we":[76],"study":[77,107],"empirical":[78],"comparisons":[79],"effectiveness":[82],"two":[85,123],"along":[89],"logistical":[92],"regression":[93],"method":[94,140],"text":[96],"classification,":[97],"using":[98],"three":[99],"real-world":[100],"datasets":[101],"from":[102],"legal":[103],"document":[104],"reviews.":[105],"The":[106],"shows":[108],"that":[109],"performs":[111],"better":[112],"(up":[113],"10%)":[115],"than":[116],"or":[117],"at":[118],"par":[119],"other":[122],"methods.":[124],"Cross-dataset":[125],"evaluation":[126],"leveraged":[128],"as":[129,137],"well":[130],"validate":[132],"performance":[134],"viable":[139],"when":[141],"labeled":[142],"data":[143],"not":[145],"available.":[146]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2023,"cited_by_count":4}],"updated_date":"2026-03-27T14:29:43.386196","created_date":"2025-10-10T00:00:00"}
