{"id":"https://openalex.org/W2053710965","doi":"https://doi.org/10.1109/icacci.2014.6968377","title":"Structural analysis and regular expressions based noise elimination from web pages for web content mining","display_name":"Structural analysis and regular expressions based noise elimination from web pages for web content mining","publication_year":2014,"publication_date":"2014-09-01","ids":{"openalex":"https://openalex.org/W2053710965","doi":"https://doi.org/10.1109/icacci.2014.6968377","mag":"2053710965"},"language":"en","primary_location":{"id":"doi:10.1109/icacci.2014.6968377","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icacci.2014.6968377","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2014 International Conference on Advances in Computing, Communications and Informatics (ICACCI)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5006644558","display_name":"Amit Dutta","orcid":"https://orcid.org/0000-0002-6058-6301"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Amit Dutta","raw_affiliation_strings":["Department of Information Technology, St. Thomas' College of Engineering and Technology, Kolkata, West Bengal, India","Department of Information Technology, St. Thomas' College of Engineering and Technology, 4, D.H. Road, Kolkata - 700023, West Bengal, India"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Information Technology, St. Thomas' College of Engineering and Technology, Kolkata, West Bengal, India","institution_ids":[]},{"raw_affiliation_string":"Department of Information Technology, St. Thomas' College of Engineering and Technology, 4, D.H. Road, Kolkata - 700023, West Bengal, India","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5092645197","display_name":"Sudipta Paria","orcid":"https://orcid.org/0009-0002-7726-8032"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sudipta Paria","raw_affiliation_strings":["Department of Information Technology, St. Thomas' College of Engineering and Technology, Kolkata, West Bengal, India","Department of Computer Science & Engineering, St. Thomas' College of Engineering and Technology, 4, D.H. Road, Kolkata - 700023, West Bengal, India"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Information Technology, St. Thomas' College of Engineering and Technology, Kolkata, West Bengal, India","institution_ids":[]},{"raw_affiliation_string":"Department of Computer Science & Engineering, St. Thomas' College of Engineering and Technology, 4, D.H. Road, Kolkata - 700023, West Bengal, India","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077904094","display_name":"Tanmoy Golui","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tanmoy Golui","raw_affiliation_strings":["Department of Information Technology, St. Thomas' College of Engineering and Technology, Kolkata, West Bengal, India","Department of Computer Science & Engineering, St. Thomas' College of Engineering and Technology, 4, D.H. Road, Kolkata - 700023, West Bengal, India"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Information Technology, St. Thomas' College of Engineering and Technology, Kolkata, West Bengal, India","institution_ids":[]},{"raw_affiliation_string":"Department of Computer Science & Engineering, St. Thomas' College of Engineering and Technology, 4, D.H. Road, Kolkata - 700023, West Bengal, India","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5083612949","display_name":"Dipak Kumar Kole","orcid":"https://orcid.org/0000-0002-6939-6993"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dipak K. Kole","raw_affiliation_strings":["Department of Information Technology, St. Thomas' College of Engineering and Technology, Kolkata, West Bengal, India","Department of Computer Science & Engineering, St. Thomas' College of Engineering and Technology, 4, D.H. Road, Kolkata - 700023, West Bengal, India"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Information Technology, St. Thomas' College of Engineering and Technology, Kolkata, West Bengal, India","institution_ids":[]},{"raw_affiliation_string":"Department of Computer Science & Engineering, St. Thomas' College of Engineering and Technology, 4, D.H. Road, Kolkata - 700023, West Bengal, India","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.8204,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.8261098,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":94},"biblio":{"volume":"97","issue":null,"first_page":"1445","last_page":"1451"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11550","display_name":"Text and Document Classification Technologies","score":0.9775999784469604,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9465000033378601,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/web-page","display_name":"Web page","score":0.8169581890106201},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8007444739341736},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.6121835708618164},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.5935121774673462},{"id":"https://openalex.org/keywords/static-web-page","display_name":"Static web page","score":0.5238747000694275},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.5198491215705872},{"id":"https://openalex.org/keywords/presentation","display_name":"Presentation (obstetrics)","score":0.505158007144928},{"id":"https://openalex.org/keywords/web-mining","display_name":"Web mining","score":0.49294257164001465},{"id":"https://openalex.org/keywords/filter","display_name":"Filter (signal processing)","score":0.4505617022514343},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.3715304732322693},{"id":"https://openalex.org/keywords/web-navigation","display_name":"Web navigation","score":0.2662278413772583},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.16896352171897888}],"concepts":[{"id":"https://openalex.org/C21959979","wikidata":"https://www.wikidata.org/wiki/Q36774","display_name":"Web page","level":2,"score":0.8169581890106201},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8007444739341736},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.6121835708618164},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.5935121774673462},{"id":"https://openalex.org/C173576120","wikidata":"https://www.wikidata.org/wiki/Q2641220","display_name":"Static web page","level":4,"score":0.5238747000694275},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.5198491215705872},{"id":"https://openalex.org/C2777601897","wikidata":"https://www.wikidata.org/wiki/Q3409113","display_name":"Presentation (obstetrics)","level":2,"score":0.505158007144928},{"id":"https://openalex.org/C197046077","wikidata":"https://www.wikidata.org/wiki/Q785337","display_name":"Web mining","level":3,"score":0.49294257164001465},{"id":"https://openalex.org/C106131492","wikidata":"https://www.wikidata.org/wiki/Q3072260","display_name":"Filter (signal processing)","level":2,"score":0.4505617022514343},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3715304732322693},{"id":"https://openalex.org/C61096286","wikidata":"https://www.wikidata.org/wiki/Q7978592","display_name":"Web navigation","level":3,"score":0.2662278413772583},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.16896352171897888},{"id":"https://openalex.org/C126838900","wikidata":"https://www.wikidata.org/wiki/Q77604","display_name":"Radiology","level":1,"score":0.0},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0},{"id":"https://openalex.org/C71924100","wikidata":"https://www.wikidata.org/wiki/Q11190","display_name":"Medicine","level":0,"score":0.0},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icacci.2014.6968377","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icacci.2014.6968377","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2014 International Conference on Advances in Computing, Communications and Informatics (ICACCI)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":11,"referenced_works":["https://openalex.org/W174477639","https://openalex.org/W1601674470","https://openalex.org/W1989338554","https://openalex.org/W2012451152","https://openalex.org/W2040075907","https://openalex.org/W2124436456","https://openalex.org/W2162407272","https://openalex.org/W2435251607","https://openalex.org/W2479182518","https://openalex.org/W4244354591","https://openalex.org/W6717827561"],"related_works":["https://openalex.org/W1979144454","https://openalex.org/W2069679074","https://openalex.org/W4239898202","https://openalex.org/W1559090489","https://openalex.org/W3116613346","https://openalex.org/W2055154498","https://openalex.org/W2115253914","https://openalex.org/W2316185946","https://openalex.org/W2351131669","https://openalex.org/W2376574812"],"abstract_inverted_index":{"Commercial":[0],"websites":[1,193],"usually":[2],"contain":[3],"noisy":[4,157],"information":[5,12,30],"blocks":[6,103,139],"along":[7],"with":[8],"main":[9],"content.":[10],"Noisy":[11,102,162],"degrades":[13],"the":[14,32,53,68,81,85,100,125,129,195,200,203],"performance":[15],"of":[16,52,99,116,128,156,202],"web":[17,33,54,82,87,114,131,166],"content":[18,21],"mining.":[19],"Web":[20],"mining":[22],"is":[23,62,71,89,177,187],"used":[24,72],"for":[25,91],"discovering":[26],"useful":[27],"knowledge":[28],"or":[29,109,145],"from":[31,133],"page.":[34,55,83],"In":[35,119],"this":[36],"paper,":[37],"we":[38,123],"propose":[39],"noise":[40,182],"elimination":[41,183],"method":[42,61,70,152],"that":[43],"uses":[44],"tag":[45,58],"based":[46,59],"filtering":[47,60,69],"followed":[48],"by":[49,64],"structural":[50,92,120,160],"analysis":[51,93,121],"The":[56,171,185],"proposed":[57,204],"implemented":[63],"regular":[65],"expression.":[66],"Firstly,":[67],"to":[73,94,136,180],"remove":[74,95,149],"several":[75,190],"predefined":[76],"HTML":[77,126],"tags":[78],"present":[79],"in":[80,112,164],"Then":[84],"concise":[86],"page":[88,115],"taken":[90],"remaining":[96],"noise.":[97],"Most":[98],"time":[101,175],"share":[104],"same":[105,141],"contents":[106,127,142,158,163],"and":[107,143,148,174,194],"layouts":[108,144],"presentation":[110,146],"styles":[111,147],"every":[113],"a":[117,134],"website.":[118],"phase,":[122],"compare":[124],"crawled":[130,165],"pages":[132,167],"website":[135],"capture":[137],"common":[138],"having":[140],"them.":[150],"Filtering":[151],"eliminates":[153],"considerable":[154],"amount":[155],"before":[159],"analysis.":[161],"get":[168],"reduced":[169],"significantly.":[170],"overall":[172],"space":[173],"complexity":[176],"less":[178],"compared":[179],"other":[181],"approach.":[184],"experiment":[186],"conducted":[188],"on":[189],"popular":[191],"commercial":[192],"results":[196],"are":[197],"shown":[198],"exposing":[199],"efficiency":[201],"method.":[205]},"counts_by_year":[{"year":2023,"cited_by_count":1},{"year":2019,"cited_by_count":1},{"year":2018,"cited_by_count":1},{"year":2017,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
