{"id":"https://openalex.org/W2257692640","doi":"https://doi.org/10.1002/sam.11323","title":"Conducting sparse feature selection on arbitrarily long phrases in text corpora with a focus on interpretability","display_name":"Conducting sparse feature selection on arbitrarily long phrases in text corpora with a focus on interpretability","publication_year":2016,"publication_date":"2016-07-18","ids":{"openalex":"https://openalex.org/W2257692640","doi":"https://doi.org/10.1002/sam.11323","mag":"2257692640"},"language":"en","primary_location":{"id":"doi:10.1002/sam.11323","is_oa":false,"landing_page_url":"https://doi.org/10.1002/sam.11323","pdf_url":null,"source":{"id":"https://openalex.org/S40788348","display_name":"Statistical Analysis and Data Mining The ASA Data Science Journal","issn_l":"1932-1864","issn":["1932-1864","1932-1872"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320595","host_organization_name":"Wiley","host_organization_lineage":["https://openalex.org/P4310320595"],"host_organization_lineage_names":["Wiley"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Statistical Analysis and Data Mining: The ASA Data Science Journal","raw_type":"journal-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/1511.06798","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5003422060","display_name":"Luke Miratrix","orcid":"https://orcid.org/0000-0002-0078-1906"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Luke Miratrix","raw_affiliation_strings":["Harvard Graduate School of Education Cambridge MA USA","Harvard Graduate School of Education , Cambridge, MA, USA"],"affiliations":[{"raw_affiliation_string":"Harvard Graduate School of Education Cambridge MA USA","institution_ids":[]},{"raw_affiliation_string":"Harvard Graduate School of Education , Cambridge, MA, USA","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5008455933","display_name":"Robin Ackerman","orcid":null},"institutions":[{"id":"https://openalex.org/I1312264882","display_name":"United States Department of Labor","ror":"https://ror.org/05mbkbj54","country_code":"US","type":"government","lineage":["https://openalex.org/I1312264882"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Robin Ackerman","raw_affiliation_strings":["US Department of Labor Boston MA USA","US Department of Labor  Boston MA USA"],"affiliations":[{"raw_affiliation_string":"US Department of Labor Boston MA USA","institution_ids":["https://openalex.org/I1312264882"]},{"raw_affiliation_string":"US Department of Labor  Boston MA USA","institution_ids":["https://openalex.org/I1312264882"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5003422060"],"corresponding_institution_ids":[],"apc_list":{"value":3760,"currency":"USD","value_usd":3760},"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.00340186,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"9","issue":"6","first_page":"435","last_page":"460"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13083","display_name":"Advanced Text Analysis Techniques","score":0.9965000152587891,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13910","display_name":"Computational and Text Analysis Methods","score":0.9962999820709229,"subfield":{"id":"https://openalex.org/subfields/3300","display_name":"General Social Sciences"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/automatic-summarization","display_name":"Automatic summarization","score":0.9078233242034912},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7402708530426025},{"id":"https://openalex.org/keywords/interpretability","display_name":"Interpretability","score":0.7163397073745728},{"id":"https://openalex.org/keywords/latent-dirichlet-allocation","display_name":"Latent Dirichlet allocation","score":0.6829953193664551},{"id":"https://openalex.org/keywords/topic-model","display_name":"Topic model","score":0.6412749290466309},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5631147623062134},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5574676990509033},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.503192126750946},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4762694537639618},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.4708934426307678},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.46431609988212585},{"id":"https://openalex.org/keywords/feature-selection","display_name":"Feature selection","score":0.4628208577632904},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.4312683939933777},{"id":"https://openalex.org/keywords/regularization","display_name":"Regularization (linguistics)","score":0.42729681730270386},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.41493383049964905},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.13475334644317627}],"concepts":[{"id":"https://openalex.org/C170858558","wikidata":"https://www.wikidata.org/wiki/Q1394144","display_name":"Automatic summarization","level":2,"score":0.9078233242034912},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7402708530426025},{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.7163397073745728},{"id":"https://openalex.org/C500882744","wikidata":"https://www.wikidata.org/wiki/Q269236","display_name":"Latent Dirichlet allocation","level":3,"score":0.6829953193664551},{"id":"https://openalex.org/C171686336","wikidata":"https://www.wikidata.org/wiki/Q3532085","display_name":"Topic model","level":2,"score":0.6412749290466309},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5631147623062134},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5574676990509033},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.503192126750946},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4762694537639618},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.4708934426307678},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.46431609988212585},{"id":"https://openalex.org/C148483581","wikidata":"https://www.wikidata.org/wiki/Q446488","display_name":"Feature selection","level":2,"score":0.4628208577632904},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.4312683939933777},{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.42729681730270386},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.41493383049964905},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.13475334644317627},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0}],"mesh":[],"locations_count":5,"locations":[{"id":"doi:10.1002/sam.11323","is_oa":false,"landing_page_url":"https://doi.org/10.1002/sam.11323","pdf_url":null,"source":{"id":"https://openalex.org/S40788348","display_name":"Statistical Analysis and Data Mining The ASA Data Science Journal","issn_l":"1932-1864","issn":["1932-1864","1932-1872"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320595","host_organization_name":"Wiley","host_organization_lineage":["https://openalex.org/P4310320595"],"host_organization_lineage_names":["Wiley"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Statistical Analysis and Data Mining: The ASA Data Science Journal","raw_type":"journal-article"},{"id":"pmh:oai:arXiv.org:1511.06798","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1511.06798","pdf_url":"https://arxiv.org/pdf/1511.06798","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"mag:2257692640","is_oa":true,"landing_page_url":"http://export.arxiv.org/pdf/1511.06798","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"arXiv (Cornell University)","raw_type":null},{"id":"pmh:oai:dash.harvard.edu:1/30828256","is_oa":true,"landing_page_url":"http://nrs.harvard.edu/urn-3:HUL.InstRepos:30828256","pdf_url":null,"source":{"id":"https://openalex.org/S4306401540","display_name":"Digital Access to Scholarship at Harvard (DASH) (Harvard University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I136199984","host_organization_name":"Harvard University","host_organization_lineage":["https://openalex.org/I136199984"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"Journal Article"},{"id":"doi:10.48550/arxiv.1511.06798","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.1511.06798","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:1511.06798","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1511.06798","pdf_url":"https://arxiv.org/pdf/1511.06798","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[{"score":0.6499999761581421,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":53,"referenced_works":["https://openalex.org/W6436850","https://openalex.org/W24521930","https://openalex.org/W1480376833","https://openalex.org/W1524688041","https://openalex.org/W1540550673","https://openalex.org/W1543911290","https://openalex.org/W1547032355","https://openalex.org/W1618978521","https://openalex.org/W1857358525","https://openalex.org/W1880262756","https://openalex.org/W1907578970","https://openalex.org/W1973193168","https://openalex.org/W1978394996","https://openalex.org/W2005422315","https://openalex.org/W2011438897","https://openalex.org/W2013850411","https://openalex.org/W2020925091","https://openalex.org/W2038411619","https://openalex.org/W2043478128","https://openalex.org/W2053463056","https://openalex.org/W2063978378","https://openalex.org/W2071106922","https://openalex.org/W2088003405","https://openalex.org/W2091172738","https://openalex.org/W2097360283","https://openalex.org/W2098162425","https://openalex.org/W2102587111","https://openalex.org/W2103333826","https://openalex.org/W2119821739","https://openalex.org/W2122825543","https://openalex.org/W2122883885","https://openalex.org/W2124549482","https://openalex.org/W2133286915","https://openalex.org/W2145767445","https://openalex.org/W2153383412","https://openalex.org/W2159426623","https://openalex.org/W2161068733","https://openalex.org/W2161793142","https://openalex.org/W2165279024","https://openalex.org/W2166183437","https://openalex.org/W2169112940","https://openalex.org/W2170654002","https://openalex.org/W2171060319","https://openalex.org/W2405916000","https://openalex.org/W2407159764","https://openalex.org/W2435251607","https://openalex.org/W2787894218","https://openalex.org/W3105543546","https://openalex.org/W3124398150","https://openalex.org/W4237791300","https://openalex.org/W4239510810","https://openalex.org/W4241931738","https://openalex.org/W4294541781"],"related_works":["https://openalex.org/W2964089330","https://openalex.org/W2102942417","https://openalex.org/W3144879261","https://openalex.org/W2947781947","https://openalex.org/W2901020213","https://openalex.org/W2807739445","https://openalex.org/W2907632654","https://openalex.org/W2096264705","https://openalex.org/W2985050317","https://openalex.org/W3016671332","https://openalex.org/W2540934360","https://openalex.org/W1499182410","https://openalex.org/W2743665309","https://openalex.org/W3138228214","https://openalex.org/W2800637552","https://openalex.org/W2974495808","https://openalex.org/W2972327144","https://openalex.org/W3103251639","https://openalex.org/W2218641061","https://openalex.org/W2396188861"],"abstract_inverted_index":{"We":[0,161,180,204],"propose":[1],"a":[2,72,95,110,144,207,228],"general":[3],"framework":[4],"for":[5,19,42,157],"topic\u2010specific":[6],"summarization":[7,64],"of":[8,34,98,112,118,132,152,168,174,192,230],"large":[9],"text":[10,224],"corpora,":[11],"and":[12,29,36,51,83,123,177,189,226,247],"illustrate":[13],"how":[14,163],"it":[15],"can":[16,149,171],"be":[17,234],"used":[18],"analysis":[20,225],"in":[21,44,80,125,201,223,237],"two":[22],"quite":[23],"different":[24],"contexts:":[25],"an":[26],"Occupational":[27],"Safety":[28],"Health":[30],"Administration":[31],"(OSHA)":[32],"database":[33],"fatality":[35],"catastrophe":[37],"reports":[38],"(to":[39,58],"facilitate":[40],"surveillance":[41],"patterns":[43],"circumstances":[45],"leading":[46],"to":[47,197,221],"injury":[48],"or":[49,104],"death),":[50],"legal":[52],"decisions":[53],"on":[54,67,165],"workers'":[55],"compensation":[56],"claims":[57],"explore":[59],"relevant":[60],"case":[61],"law).":[62],"Our":[63],"framework,":[65],"built":[66],"sparse":[68,217],"classification":[69],"methods,":[70],"is":[71,227],"compromise":[73],"between":[74],"simple":[75],"word":[76,195],"frequency\u2010based":[77],"methods":[78,87,200,218],"currently":[79],"wide":[81],"use,":[82],"more":[84],"heavyweight,":[85],"model\u2010intensive":[86],"such":[88],"as":[89,135,140],"latent":[90],"Dirichlet":[91],"allocation":[92],"(LDA).":[93],"For":[94],"particular":[96],"topic":[97],"interest":[99],"(e.g.,":[100],"mental":[101],"health":[102],"disability,":[103],"carbon":[105],"monoxide":[106],"exposure),":[107],"we":[108,214],"regress":[109],"labeling":[111],"documents":[113],"onto":[114],"the":[115,120,126,141,166,169,186,193,202],"high\u2010dimensional":[116],"counts":[117],"all":[119],"other":[121,199],"words":[122],"phrases":[124,133,151],"documents.":[127],"The":[128,250],"resulting":[129,194],"small":[130],"set":[131],"found":[134],"predictive":[136],"are":[137],"then":[138],"harvested":[139],"summary.":[142],"Using":[143],"branch\u2010and\u2010bound":[145],"approach,":[146],"this":[147,182,238],"method":[148],"incorporate":[150],"arbitrary":[153],"length,":[154],"which":[155],"allows":[156],"potentially":[158],"rich":[159],"summarization.":[160],"discuss":[162],"focus":[164],"purpose":[167],"summaries":[170],"inform":[172],"choices":[173],"tuning":[175],"parameters":[176],"model":[178],"constraints.":[179],"evaluate":[181],"tool":[183],"by":[184],"comparing":[185],"computational":[187],"time":[188],"summary":[190],"statistics":[191],"lists":[196],"three":[198],"literature.":[203],"also":[205],"present":[206],"new":[208],"R":[209],"package,":[210],"textreg":[211],".":[212],"Overall,":[213],"argue":[215],"that":[216,232],"have":[219],"much":[220],"offer":[222],"branch":[229],"research":[231],"should":[233],"considered":[235],"further":[236],"context.":[239],"\u00a9":[240],"2016":[241,255],"Wiley":[242],"Periodicals,":[243],"Inc.":[244],"Statistical":[245],"Analysis":[246],"Data":[248,252],"Mining:":[249],"ASA":[251],"Science":[253],"Journal,":[254]},"counts_by_year":[],"updated_date":"2026-02-09T09:26:11.010843","created_date":"2025-10-10T00:00:00"}
