{"id":"https://openalex.org/W7161114948","doi":"https://doi.org/10.1007/s00607-026-01666-5","title":"LLMs applied to web scraping and web crawling: a systematic review","display_name":"LLMs applied to web scraping and web crawling: a systematic review","publication_year":2026,"publication_date":"2026-05-14","ids":{"openalex":"https://openalex.org/W7161114948","doi":"https://doi.org/10.1007/s00607-026-01666-5"},"language":"en","primary_location":{"id":"doi:10.1007/s00607-026-01666-5","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s00607-026-01666-5","pdf_url":"https://link.springer.com/content/pdf/10.1007/s00607-026-01666-5.pdf","source":{"id":"https://openalex.org/S35593046","display_name":"Computing","issn_l":"0010-485X","issn":["0010-485X","1436-5057"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://link.springer.com/content/pdf/10.1007/s00607-026-01666-5.pdf","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5136165315","display_name":"Pablo Landeta-L\u00f3pez","orcid":"https://orcid.org/0000-0002-2914-8696"},"institutions":[{"id":"https://openalex.org/I4210130315","display_name":"Universidad T\u00e9cnica del Norte","ror":"https://ror.org/03f0t8b71","country_code":"EC","type":"education","lineage":["https://openalex.org/I4210130315"]},{"id":"https://openalex.org/I79238269","display_name":"Universidad de Sevilla","ror":"https://ror.org/03yxnpp24","country_code":"ES","type":"education","lineage":["https://openalex.org/I79238269"]}],"countries":["EC","ES"],"is_corresponding":true,"raw_author_name":"Pablo Landeta-L\u00f3pez","raw_affiliation_strings":["SCORE Lab, I3US Institute, Universidad de Sevilla, Seville, Spain","Universidad T\u00e9cnica del Norte, Ibarra, Ecuador"],"raw_orcid":"https://orcid.org/0000-0002-2914-8696","affiliations":[{"raw_affiliation_string":"SCORE Lab, I3US Institute, Universidad de Sevilla, Seville, Spain","institution_ids":["https://openalex.org/I79238269"]},{"raw_affiliation_string":"Universidad T\u00e9cnica del Norte, Ibarra, Ecuador","institution_ids":["https://openalex.org/I4210130315"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102007313","display_name":"Jos\u00e9 Mar\u00eda Garc\u00eda","orcid":"https://orcid.org/0000-0002-0303-2740"},"institutions":[{"id":"https://openalex.org/I79238269","display_name":"Universidad de Sevilla","ror":"https://ror.org/03yxnpp24","country_code":"ES","type":"education","lineage":["https://openalex.org/I79238269"]}],"countries":["ES"],"is_corresponding":false,"raw_author_name":"Jos\u00e9 Mar\u00eda Garc\u00eda","raw_affiliation_strings":["SCORE Lab, I3US Institute, Universidad de Sevilla, Seville, Spain"],"raw_orcid":"https://orcid.org/0000-0002-0303-2740","affiliations":[{"raw_affiliation_string":"SCORE Lab, I3US Institute, Universidad de Sevilla, Seville, Spain","institution_ids":["https://openalex.org/I79238269"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130545187","display_name":"Cathy Guevara-Vega","orcid":null},"institutions":[{"id":"https://openalex.org/I4210130315","display_name":"Universidad T\u00e9cnica del Norte","ror":"https://ror.org/03f0t8b71","country_code":"EC","type":"education","lineage":["https://openalex.org/I4210130315"]}],"countries":["EC"],"is_corresponding":false,"raw_author_name":"Cathy Guevara-Vega","raw_affiliation_strings":["Universidad T\u00e9cnica del Norte, Ibarra, Ecuador"],"raw_orcid":"https://orcid.org/0000-0002-2470-8287","affiliations":[{"raw_affiliation_string":"Universidad T\u00e9cnica del Norte, Ibarra, Ecuador","institution_ids":["https://openalex.org/I4210130315"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5121810183","display_name":"Antonio Ruiz-Cort\u00e9s","orcid":null},"institutions":[{"id":"https://openalex.org/I79238269","display_name":"Universidad de Sevilla","ror":"https://ror.org/03yxnpp24","country_code":"ES","type":"education","lineage":["https://openalex.org/I79238269"]}],"countries":["ES"],"is_corresponding":false,"raw_author_name":"Antonio Ruiz-Cort\u00e9s","raw_affiliation_strings":["SCORE Lab, I3US Institute, Universidad de Sevilla, Seville, Spain"],"raw_orcid":"https://orcid.org/0000-0001-9827-1834","affiliations":[{"raw_affiliation_string":"SCORE Lab, I3US Institute, Universidad de Sevilla, Seville, Spain","institution_ids":["https://openalex.org/I79238269"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5136165315"],"corresponding_institution_ids":["https://openalex.org/I4210130315","https://openalex.org/I79238269"],"apc_list":{"value":2290,"currency":"EUR","value_usd":2890},"apc_paid":{"value":2290,"currency":"EUR","value_usd":2890},"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.92270669,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":"108","issue":"6","first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.7599999904632568,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.7599999904632568,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12479","display_name":"Web Application Security Vulnerabilities","score":0.0681999996304512,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11644","display_name":"Spam and Phishing Detection","score":0.02419999986886978,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/semantic-web","display_name":"Semantic Web","score":0.5264999866485596},{"id":"https://openalex.org/keywords/crawling","display_name":"Crawling","score":0.4569000005722046},{"id":"https://openalex.org/keywords/web-application","display_name":"Web application","score":0.382999986410141},{"id":"https://openalex.org/keywords/point","display_name":"Point (geometry)","score":0.38100001215934753},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.36309999227523804},{"id":"https://openalex.org/keywords/html","display_name":"HTML","score":0.3619000017642975},{"id":"https://openalex.org/keywords/web-standards","display_name":"Web standards","score":0.33730000257492065},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.33320000767707825}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.640999972820282},{"id":"https://openalex.org/C2129575","wikidata":"https://www.wikidata.org/wiki/Q54837","display_name":"Semantic Web","level":2,"score":0.5264999866485596},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.5200999975204468},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.47940000891685486},{"id":"https://openalex.org/C100368936","wikidata":"https://www.wikidata.org/wiki/Q1411725","display_name":"Crawling","level":2,"score":0.4569000005722046},{"id":"https://openalex.org/C118643609","wikidata":"https://www.wikidata.org/wiki/Q189210","display_name":"Web application","level":2,"score":0.382999986410141},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.38100001215934753},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.36309999227523804},{"id":"https://openalex.org/C138708601","wikidata":"https://www.wikidata.org/wiki/Q8811","display_name":"HTML","level":3,"score":0.3619000017642975},{"id":"https://openalex.org/C182321512","wikidata":"https://www.wikidata.org/wiki/Q1153289","display_name":"Web standards","level":3,"score":0.33730000257492065},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.33320000767707825},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.33149999380111694},{"id":"https://openalex.org/C2777466982","wikidata":"https://www.wikidata.org/wiki/Q5227287","display_name":"Data extraction","level":3,"score":0.31700000166893005},{"id":"https://openalex.org/C97200028","wikidata":"https://www.wikidata.org/wiki/Q1196135","display_name":"Web engineering","level":5,"score":0.29739999771118164},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2957000136375427},{"id":"https://openalex.org/C162005631","wikidata":"https://www.wikidata.org/wiki/Q54837","display_name":"Data Web","level":3,"score":0.2822999954223633},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.2775999903678894},{"id":"https://openalex.org/C189708586","wikidata":"https://www.wikidata.org/wiki/Q1504425","display_name":"Systematic review","level":3,"score":0.2689000070095062},{"id":"https://openalex.org/C534406577","wikidata":"https://www.wikidata.org/wiki/Q7550843","display_name":"Social Semantic Web","level":3,"score":0.26669999957084656},{"id":"https://openalex.org/C130436687","wikidata":"https://www.wikidata.org/wiki/Q7978591","display_name":"Web modeling","level":3,"score":0.26499998569488525},{"id":"https://openalex.org/C79373723","wikidata":"https://www.wikidata.org/wiki/Q386275","display_name":"Web development","level":3,"score":0.2538999915122986},{"id":"https://openalex.org/C24756922","wikidata":"https://www.wikidata.org/wiki/Q1757694","display_name":"Data quality","level":3,"score":0.25209999084472656},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.250900000333786}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1007/s00607-026-01666-5","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s00607-026-01666-5","pdf_url":"https://link.springer.com/content/pdf/10.1007/s00607-026-01666-5.pdf","source":{"id":"https://openalex.org/S35593046","display_name":"Computing","issn_l":"0010-485X","issn":["0010-485X","1436-5057"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computing","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1007/s00607-026-01666-5","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s00607-026-01666-5","pdf_url":"https://link.springer.com/content/pdf/10.1007/s00607-026-01666-5.pdf","source":{"id":"https://openalex.org/S35593046","display_name":"Computing","issn_l":"0010-485X","issn":["0010-485X","1436-5057"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computing","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320310967","display_name":"Universidad de Sevilla","ror":"https://ror.org/03yxnpp24"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W7161114948.pdf","grobid_xml":"https://content.openalex.org/works/W7161114948.grobid-xml"},"referenced_works_count":110,"referenced_works":["https://openalex.org/W1999798506","https://openalex.org/W2037163971","https://openalex.org/W2106956101","https://openalex.org/W2164777277","https://openalex.org/W2573665256","https://openalex.org/W2805236450","https://openalex.org/W2981089724","https://openalex.org/W2999962654","https://openalex.org/W3014183340","https://openalex.org/W3043293716","https://openalex.org/W3045961212","https://openalex.org/W3092501800","https://openalex.org/W3118615836","https://openalex.org/W3123893780","https://openalex.org/W3141363959","https://openalex.org/W3142584950","https://openalex.org/W3168630447","https://openalex.org/W3199748762","https://openalex.org/W3207535763","https://openalex.org/W3213241618","https://openalex.org/W3215669057","https://openalex.org/W4200146754","https://openalex.org/W4220959955","https://openalex.org/W4283264542","https://openalex.org/W4293218986","https://openalex.org/W4296396554","https://openalex.org/W4321108071","https://openalex.org/W4321457507","https://openalex.org/W4367046714","https://openalex.org/W4382246105","https://openalex.org/W4384069089","https://openalex.org/W4384211302","https://openalex.org/W4385625400","https://openalex.org/W4386730038","https://openalex.org/W4387394008","https://openalex.org/W4387869777","https://openalex.org/W4388525526","https://openalex.org/W4390071549","https://openalex.org/W4392067260","https://openalex.org/W4392305859","https://openalex.org/W4394762995","https://openalex.org/W4399039431","https://openalex.org/W4399118910","https://openalex.org/W4400962047","https://openalex.org/W4402577210","https://openalex.org/W4402613453","https://openalex.org/W4402671444","https://openalex.org/W4402671963","https://openalex.org/W4402779033","https://openalex.org/W4402830620","https://openalex.org/W4403125040","https://openalex.org/W4403577823","https://openalex.org/W4403600666","https://openalex.org/W4403677821","https://openalex.org/W4403981245","https://openalex.org/W4404239369","https://openalex.org/W4404515062","https://openalex.org/W4404612057","https://openalex.org/W4404782707","https://openalex.org/W4404783707","https://openalex.org/W4404789553","https://openalex.org/W4404801167","https://openalex.org/W4405205161","https://openalex.org/W4405709931","https://openalex.org/W4406444723","https://openalex.org/W4406457866","https://openalex.org/W4406459909","https://openalex.org/W4406461680","https://openalex.org/W4406524538","https://openalex.org/W4406663050","https://openalex.org/W4406728422","https://openalex.org/W4406873056","https://openalex.org/W4407150845","https://openalex.org/W4407510795","https://openalex.org/W4407949990","https://openalex.org/W4407994682","https://openalex.org/W4408063039","https://openalex.org/W4408358826","https://openalex.org/W4408750423","https://openalex.org/W4408862815","https://openalex.org/W4409640673","https://openalex.org/W4409888796","https://openalex.org/W4410100017","https://openalex.org/W4410359094","https://openalex.org/W4410630753","https://openalex.org/W4410887803","https://openalex.org/W4411172651","https://openalex.org/W4411418916","https://openalex.org/W4411584271","https://openalex.org/W4411726787","https://openalex.org/W4411965795","https://openalex.org/W4412347848","https://openalex.org/W4412620316","https://openalex.org/W4412695056","https://openalex.org/W4412742207","https://openalex.org/W4413340479","https://openalex.org/W4413374908","https://openalex.org/W4413458158","https://openalex.org/W4414405811","https://openalex.org/W4414463610","https://openalex.org/W4414799339","https://openalex.org/W4415524135","https://openalex.org/W4416549320","https://openalex.org/W4417337888","https://openalex.org/W4417509881","https://openalex.org/W6966367167","https://openalex.org/W7081965627","https://openalex.org/W7083053435","https://openalex.org/W7124576615","https://openalex.org/W7131790735"],"related_works":[],"abstract_inverted_index":{"Abstract":[0],"The":[1,88,218],"integration":[2],"of":[3,57,96,129],"Large":[4],"Language":[5,211],"Models":[6,212],"(LLMs)":[7],"with":[8,94,115,198],"web":[9,17,235],"scraping":[10],"and":[11,24,44,60,74,84,113,122,148,168,184,196,240,249,261],"crawling":[12],"techniques":[13],"is":[14,134,205],"transforming":[15],"automated":[16],"data":[18,166],"extraction":[19],"by":[20],"enabling":[21,224],"semantic":[22],"understanding":[23],"adaptability.":[25],"This":[26],"Systematic":[27],"Literature":[28],"Review":[29],"(SLR)":[30],"synthesizes":[31],"evidence":[32],"regarding":[33],"this":[34],"integration,":[35],"focusing":[36],"on":[37,173],"tools,":[38],"models,":[39],"challenges,":[40],"evaluation":[41,263],"methods,":[42],"trends,":[43],"applications.":[45],"Following":[46],"PRISMA":[47],"guidelines,":[48],"we":[49],"conducted":[50],"a":[51,225],"rigorous":[52],"search":[53],"across":[54],"Scopus,":[55],"Web":[56],"Science,":[58],"ACM,":[59],"IEEE":[61],"databases":[62],"(2021\u20132025).":[63],"From":[64],"976":[65],"screened":[66],"records,":[67],"91":[68,130],"high-quality":[69],"studies":[70],"(53":[71],"conference":[72],"papers":[73],"38":[75],"journal":[76],"articles)":[77],"were":[78],"selected":[79],"after":[80],"duplicate":[81],"removal,":[82],"screening,":[83],"AI-powered":[85],"quality":[86],"assessment.":[87],"field":[89],"has":[90],"experienced":[91],"explosive":[92],"growth,":[93],"84%":[95],"publications":[97],"appearing":[98],"in":[99,103,106,140,146,155,234,238],"2024\u20132025":[100],"alone":[101],"(36":[102],"2024,":[104],"40":[105],"2025).":[107],"Key":[108],"tools":[109,118],"include":[110],"Scrapy,":[111],"BeautifulSoup,":[112],"Selenium,":[114],"emerging":[116],"LLM-augmented":[117],"like":[119],"Scrapeghost,":[120],"Crawl4AI,":[121],"ScrapeGraphAI.":[123],"While":[124],"transformer-based":[125],"models":[126],"dominate":[127],"(86":[128],"papers),":[131],"the":[132,136,143,206],"landscape":[133],"diversifying:":[135],"BERT":[137],"family":[138,145],"appears":[139],"23":[141],"studies,":[142],"GPT":[144],"34,":[147],"other":[149],"LLMs":[150,222],"(Llama,":[151],"Mistral,":[152],"Claude,":[153],"Gemini)":[154],"44.":[156],"Major":[157],"challenges":[158],"involve":[159],"HTML":[160],"complexity,":[161],"computational":[162],"costs,":[163],"token":[164],"limits,":[165],"biases,":[167],"legal":[169],"risks.":[170],"Evaluation":[171],"relies":[172],"hybrid":[174,259],"frameworks":[175],"combining":[176],"task-specific":[177],"metrics":[178],"(F1,":[179],"BLEU,":[180],"RAGAS),":[181],"human":[182],"validation,":[183],"operational":[185],"efficiency":[186,241],"measures.":[187],"Applications":[188],"span":[189],"Cybersecurity,":[190],"Healthcare,":[191],"Education,":[192],"E-commerce,":[193],"Media,":[194],"Technology,":[195],"Finance/Legal,":[197],"high":[199],"thematic":[200],"specialization.":[201],"A":[202],"notable":[203],"trend":[204],"shift":[207],"toward":[208,246],"efficient":[209],"Small":[210],"(SLMs)":[213],"for":[214],"resource-constrained,":[215],"domain-specific":[216],"tasks.":[217],"findings":[219],"suggest":[220],"that":[221],"are":[223],"decisive":[226],"transition":[227],"from":[228],"rule-based":[229],"to":[230],"semantic,":[231],"agentic":[232],"approaches":[233],"extraction.":[236],"Challenges":[237],"robustness":[239],"persist,":[242],"but":[243],"trends":[244],"point":[245],"intelligent,":[247],"domain-specialized,":[248],"ethically":[250],"aware":[251],"systems.":[252],"Future":[253],"work":[254],"should":[255],"explore":[256],"SLM":[257],"implementation,":[258],"pipelines,":[260],"standardized":[262],"benchmarks.":[264]},"counts_by_year":[],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2026-05-15T00:00:00"}
