{"id":"https://openalex.org/W3207097338","doi":"https://doi.org/10.1145/3474944.3474958","title":"Using Spark for Text Mining on Large Scale Liver Cancer Literature","display_name":"Using Spark for Text Mining on Large Scale Liver Cancer Literature","publication_year":2021,"publication_date":"2021-01-16","ids":{"openalex":"https://openalex.org/W3207097338","doi":"https://doi.org/10.1145/3474944.3474958","mag":"3207097338"},"language":"en","primary_location":{"id":"doi:10.1145/3474944.3474958","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3474944.3474958","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 the 3rd International Conference on Big Data Engineering and Technology (BDET)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5085762059","display_name":"Ming-Yen Lin","orcid":"https://orcid.org/0000-0003-3180-3132"},"institutions":[{"id":"https://openalex.org/I4880106","display_name":"Feng Chia University","ror":"https://ror.org/05vhczg54","country_code":"TW","type":"education","lineage":["https://openalex.org/I4880106"]}],"countries":["TW"],"is_corresponding":true,"raw_author_name":"Ming-Yen Lin","raw_affiliation_strings":["Feng Chia University, Taiwan"],"affiliations":[{"raw_affiliation_string":"Feng Chia University, Taiwan","institution_ids":["https://openalex.org/I4880106"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101018865","display_name":"Yu\u2010Ju Lin","orcid":null},"institutions":[{"id":"https://openalex.org/I4880106","display_name":"Feng Chia University","ror":"https://ror.org/05vhczg54","country_code":"TW","type":"education","lineage":["https://openalex.org/I4880106"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Yu-Ju Lin","raw_affiliation_strings":["Feng Chia University, Taiwan"],"affiliations":[{"raw_affiliation_string":"Feng Chia University, Taiwan","institution_ids":["https://openalex.org/I4880106"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5077724127","display_name":"Sue-Chen Hsueh","orcid":"https://orcid.org/0000-0002-0330-6967"},"institutions":[{"id":"https://openalex.org/I126145234","display_name":"Chaoyang University of Technology","ror":"https://ror.org/04xwksx09","country_code":"TW","type":"education","lineage":["https://openalex.org/I126145234"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Sue-Chen Hsueh","raw_affiliation_strings":["Chaoyang University of Technology, Taiwan"],"affiliations":[{"raw_affiliation_string":"Chaoyang University of Technology, Taiwan","institution_ids":["https://openalex.org/I126145234"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5085762059"],"corresponding_institution_ids":["https://openalex.org/I4880106"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.10068617,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"82","last_page":"87"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.9959999918937683,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.9959999918937683,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T11396","display_name":"Artificial Intelligence in Healthcare","score":0.9828000068664551,"subfield":{"id":"https://openalex.org/subfields/3605","display_name":"Health Information Management"},"field":{"id":"https://openalex.org/fields/36","display_name":"Health Professions"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9786999821662903,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/support-vector-machine","display_name":"Support vector machine","score":0.7608920335769653},{"id":"https://openalex.org/keywords/liver-cancer","display_name":"Liver cancer","score":0.7199373245239258},{"id":"https://openalex.org/keywords/logistic-regression","display_name":"Logistic regression","score":0.6253920793533325},{"id":"https://openalex.org/keywords/cancer","display_name":"Cancer","score":0.6092278957366943},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.572546124458313},{"id":"https://openalex.org/keywords/spark","display_name":"SPARK (programming language)","score":0.5383762717247009},{"id":"https://openalex.org/keywords/lung-cancer","display_name":"Lung cancer","score":0.5223392248153687},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.5091213583946228},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4758434295654297},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.46531060338020325},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.40447551012039185},{"id":"https://openalex.org/keywords/medicine","display_name":"Medicine","score":0.2768898606300354},{"id":"https://openalex.org/keywords/oncology","display_name":"Oncology","score":0.15461406111717224},{"id":"https://openalex.org/keywords/internal-medicine","display_name":"Internal medicine","score":0.15158945322036743},{"id":"https://openalex.org/keywords/cartography","display_name":"Cartography","score":0.07186079025268555}],"concepts":[{"id":"https://openalex.org/C12267149","wikidata":"https://www.wikidata.org/wiki/Q282453","display_name":"Support vector machine","level":2,"score":0.7608920335769653},{"id":"https://openalex.org/C2776231280","wikidata":"https://www.wikidata.org/wiki/Q623031","display_name":"Liver cancer","level":3,"score":0.7199373245239258},{"id":"https://openalex.org/C151956035","wikidata":"https://www.wikidata.org/wiki/Q1132755","display_name":"Logistic regression","level":2,"score":0.6253920793533325},{"id":"https://openalex.org/C121608353","wikidata":"https://www.wikidata.org/wiki/Q12078","display_name":"Cancer","level":2,"score":0.6092278957366943},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.572546124458313},{"id":"https://openalex.org/C2781215313","wikidata":"https://www.wikidata.org/wiki/Q3493345","display_name":"SPARK (programming language)","level":2,"score":0.5383762717247009},{"id":"https://openalex.org/C2776256026","wikidata":"https://www.wikidata.org/wiki/Q47912","display_name":"Lung cancer","level":2,"score":0.5223392248153687},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5091213583946228},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4758434295654297},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.46531060338020325},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.40447551012039185},{"id":"https://openalex.org/C71924100","wikidata":"https://www.wikidata.org/wiki/Q11190","display_name":"Medicine","level":0,"score":0.2768898606300354},{"id":"https://openalex.org/C143998085","wikidata":"https://www.wikidata.org/wiki/Q162555","display_name":"Oncology","level":1,"score":0.15461406111717224},{"id":"https://openalex.org/C126322002","wikidata":"https://www.wikidata.org/wiki/Q11180","display_name":"Internal medicine","level":1,"score":0.15158945322036743},{"id":"https://openalex.org/C58640448","wikidata":"https://www.wikidata.org/wiki/Q42515","display_name":"Cartography","level":1,"score":0.07186079025268555},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3474944.3474958","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3474944.3474958","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 the 3rd International Conference on Big Data Engineering and Technology (BDET)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.6800000071525574,"id":"https://metadata.un.org/sdg/3","display_name":"Good health and well-being"}],"awards":[{"id":"https://openalex.org/G6314126196","display_name":null,"funder_award_id":"MOST109-2221-E-035-064","funder_id":"https://openalex.org/F4320322795","funder_display_name":"Ministry of Science and Technology, Taiwan"}],"funders":[{"id":"https://openalex.org/F4320322795","display_name":"Ministry of Science and Technology, Taiwan","ror":"https://ror.org/02kv4zf79"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":12,"referenced_works":["https://openalex.org/W2004147962","https://openalex.org/W2119821739","https://openalex.org/W2120751691","https://openalex.org/W2131614043","https://openalex.org/W2136964861","https://openalex.org/W2301801058","https://openalex.org/W2398422715","https://openalex.org/W2399179461","https://openalex.org/W2527654716","https://openalex.org/W2630720134","https://openalex.org/W2802716607","https://openalex.org/W4239510810"],"related_works":["https://openalex.org/W1975949872","https://openalex.org/W3159871278","https://openalex.org/W2230552005","https://openalex.org/W2905242764","https://openalex.org/W3109411864","https://openalex.org/W3017846737","https://openalex.org/W4379407450","https://openalex.org/W3003280185","https://openalex.org/W2613379984","https://openalex.org/W3198126144"],"abstract_inverted_index":{"Cancer":[0],"is":[1,15,76,90,131,160,172,185,197,223],"one":[2,18],"of":[3,7,11,48,182,220],"the":[4,33,73,102,180,190,207,213,218],"main":[5],"causes":[6],"death.":[8],"The":[9,46,57,225],"number":[10,47,58],"known":[12],"cancer":[13,34,53,120,135,159,238],"to-date":[14],"more":[16],"than":[17,177],"hundred.":[19],"Liver":[20],"cancer,":[21,27],"ranked":[22,41],"after":[23],"trachea":[24],"and":[25,39,179,199],"lung":[26],"has":[28],"long":[29],"been":[30],"high":[31,61],"in":[32,37,44,65,110,140,153,189],"leading":[35],"cause":[36],"Taiwan":[38],"even":[40],"as":[42,60,62],"second":[43],"2016.":[45],"scientific":[49],"articles":[50,89,121],"related":[51,132],"to":[52,85,133,157,187],"proliferates":[54],"every":[55],"year.":[56],"reaches":[59],"20":[63],"million":[64],"PubMed":[66,111],"so":[67,122],"that":[68,123,229,230],"discovering":[69],"useful":[70],"information":[71],"from":[72],"massive":[74],"collection":[75],"very":[77,91],"difficult.":[78],"In":[79,212],"addition,":[80],"using":[81,101,164,192],"a":[82,96,115],"single":[83],"machine":[84],"sift":[86],"through":[87],"these":[88],"time-consuming.":[92],"Therefore,":[93],"we":[94],"present":[95],"big":[97],"data":[98],"analytic":[99],"framework":[100],"distributed":[103],"Apache":[104],"Spark":[105,141],"platform":[106],"for":[107,118],"text":[108],"mining":[109],"literature.":[112],"We":[113],"establish":[114],"prediction":[116,232],"model":[117,233],"liver":[119,134,158,237],"researchers":[124],"may":[125,209,234],"effectively":[126,235],"validate":[127],"whether":[128],"an":[129],"article":[130],"or":[136],"not.":[137],"Classification":[138],"models":[139],"MLlib":[142],"including":[143],"Linear":[144],"Support":[145],"Vector":[146],"Machines":[147],"(SVM),":[148],"Logistic":[149,170],"Regression,":[150],"are":[151],"used":[152],"our":[154,231],"experiments.":[155],"Relevancy":[156],"further":[161],"confirmed":[162],"by":[163],"MeSH":[165],"(Medical":[166],"Subject":[167],"Headings)":[168],"terms.":[169],"regression":[171],"about":[173],"3":[174],"times":[175],"faster":[176],"SVMs":[178,221],"accuracy":[181,208,219],"both":[183],"methods":[184,222],"close":[186],"95%":[188],"experiments":[191,214],"hold-out":[193],"validation.":[194],"When":[195],"max_features":[196],"500":[198],"min_df":[200,204],"\u2264":[201],"0.1":[202],"(or":[203],"=":[205],"1),":[206],"reach":[210],"96%.":[211,224],"with":[215],"K-fold":[216],"cross-validation,":[217],"experimental":[226],"results":[227],"show":[228],"classify":[236],"articles.":[239]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-02-27T16:54:17.756197","created_date":"2025-10-10T00:00:00"}
