{"id":"https://openalex.org/W4409657349","doi":"https://doi.org/10.1145/3696410.3714546","title":"HtmlRAG: HTML is Better Than Plain Text for Modeling Retrieved Knowledge in RAG Systems","display_name":"HtmlRAG: HTML is Better Than Plain Text for Modeling Retrieved Knowledge in RAG Systems","publication_year":2025,"publication_date":"2025-04-22","ids":{"openalex":"https://openalex.org/W4409657349","doi":"https://doi.org/10.1145/3696410.3714546"},"language":"en","primary_location":{"id":"doi:10.1145/3696410.3714546","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3696410.3714546","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3696410.3714546","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Web Conference 2025","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3696410.3714546","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5026689688","display_name":"Jiejun Tan","orcid":"https://orcid.org/0009-0001-8106-4780"},"institutions":[{"id":"https://openalex.org/I78988378","display_name":"Renmin University of China","ror":"https://ror.org/041pakw92","country_code":"CN","type":"education","lineage":["https://openalex.org/I78988378"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jiejun Tan","raw_affiliation_strings":["Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China","institution_ids":["https://openalex.org/I78988378"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010558184","display_name":"Zhicheng Dou","orcid":"https://orcid.org/0000-0002-9781-948X"},"institutions":[{"id":"https://openalex.org/I78988378","display_name":"Renmin University of China","ror":"https://ror.org/041pakw92","country_code":"CN","type":"education","lineage":["https://openalex.org/I78988378"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhicheng Dou","raw_affiliation_strings":["Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China","institution_ids":["https://openalex.org/I78988378"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101052030","display_name":"Wen Wang","orcid":"https://orcid.org/0000-0002-6733-1838"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wen Wang","raw_affiliation_strings":["Baichuan Intelligent Technology, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Baichuan Intelligent Technology, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5044870952","display_name":"M. W. Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mang Wang","raw_affiliation_strings":["Baichuan Intelligent Technology, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Baichuan Intelligent Technology, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035684435","display_name":"Weipeng Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Weipeng Chen","raw_affiliation_strings":["Baichuan Intelligent Technology, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Baichuan Intelligent Technology, Beijing, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5025631695","display_name":"Ji-Rong Wen","orcid":"https://orcid.org/0000-0002-9777-9676"},"institutions":[{"id":"https://openalex.org/I78988378","display_name":"Renmin University of China","ror":"https://ror.org/041pakw92","country_code":"CN","type":"education","lineage":["https://openalex.org/I78988378"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ji-Rong Wen","raw_affiliation_strings":["Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China","institution_ids":["https://openalex.org/I78988378"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5026689688"],"corresponding_institution_ids":["https://openalex.org/I78988378"],"apc_list":null,"apc_paid":null,"fwci":23.1692,"has_fulltext":true,"cited_by_count":10,"citation_normalized_percentile":{"value":0.99313831,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1733","last_page":"1746"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9789999723434448,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9625999927520752,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.773769736289978},{"id":"https://openalex.org/keywords/plain-text","display_name":"Plain text","score":0.7553808689117432},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.42033326625823975},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.32886022329330444},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.08400562405586243}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.773769736289978},{"id":"https://openalex.org/C46503548","wikidata":"https://www.wikidata.org/wiki/Q1145976","display_name":"Plain text","level":3,"score":0.7553808689117432},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.42033326625823975},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.32886022329330444},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.08400562405586243},{"id":"https://openalex.org/C148730421","wikidata":"https://www.wikidata.org/wiki/Q141090","display_name":"Encryption","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3696410.3714546","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3696410.3714546","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3696410.3714546","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Web Conference 2025","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3696410.3714546","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3696410.3714546","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3696410.3714546","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Web Conference 2025","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1899978318","display_name":null,"funder_award_id":"L233008","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2087396116","display_name":null,"funder_award_id":"China","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G228125052","display_name":null,"funder_award_id":"62272467","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2884910486","display_name":null,"funder_award_id":"Technology","funder_id":"https://openalex.org/F4320322724","funder_display_name":"Ministry of Education, India"},{"id":"https://openalex.org/G3317480652","display_name":null,"funder_award_id":"Science","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5939423041","display_name":null,"funder_award_id":"Technology","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5994120800","display_name":null,"funder_award_id":"Natural","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G6809183074","display_name":null,"funder_award_id":"Project No.","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320322499","display_name":"Renmin University of China","ror":"https://ror.org/041pakw92"},{"id":"https://openalex.org/F4320322724","display_name":"Ministry of Education, India","ror":"https://ror.org/048xjjh50"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4409657349.pdf","grobid_xml":"https://content.openalex.org/works/W4409657349.grobid-xml"},"referenced_works_count":47,"referenced_works":["https://openalex.org/W2613959975","https://openalex.org/W2889787757","https://openalex.org/W2912924812","https://openalex.org/W2930957955","https://openalex.org/W2950681488","https://openalex.org/W2963339397","https://openalex.org/W3099700870","https://openalex.org/W3121976951","https://openalex.org/W3173325518","https://openalex.org/W4252076394","https://openalex.org/W4367310778","https://openalex.org/W4385570777","https://openalex.org/W4385573898","https://openalex.org/W4387846431","https://openalex.org/W4389519118","https://openalex.org/W4389519153","https://openalex.org/W4389519598","https://openalex.org/W4389519918","https://openalex.org/W4389520286","https://openalex.org/W4389524398","https://openalex.org/W4392126139","https://openalex.org/W4393147971","https://openalex.org/W4395443445","https://openalex.org/W4396722757","https://openalex.org/W4396921943","https://openalex.org/W4398795713","https://openalex.org/W4400524654","https://openalex.org/W4401042773","https://openalex.org/W4402671236","https://openalex.org/W4402671835","https://openalex.org/W4402671857","https://openalex.org/W4402672004","https://openalex.org/W4402672112","https://openalex.org/W4402684030","https://openalex.org/W4402684169","https://openalex.org/W4402715021","https://openalex.org/W4405470551","https://openalex.org/W6600062020","https://openalex.org/W6600109629","https://openalex.org/W6600238479","https://openalex.org/W6600424091","https://openalex.org/W6601613822","https://openalex.org/W6601903828","https://openalex.org/W6603036350","https://openalex.org/W6603135882","https://openalex.org/W6603143895","https://openalex.org/W6608103894"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"Retrieval-Augmented":[0],"Generation":[1],"(RAG)":[2],"has":[3],"been":[4],"shown":[5],"to":[6,77,147,174,193],"improve":[7],"knowledge":[8,25,124,137],"capabilities":[9],"and":[10,30,58,86,95,141,164,172,187,219],"alleviate":[11,106],"the":[12,56,64,75,79,84,120,175,195,199,209],"hallucination":[13],"problem":[14],"of":[15,23,55,83,116,122,201,211],"LLMs.":[16],"The":[17],"Web":[18,37],"is":[19,98,130],"a":[20,188],"major":[21,42],"source":[22],"external":[24,139],"used":[26,36],"in":[27,90,125,135,138,214],"RAG":[28,33,47,103,176,215],"systems,":[29],"many":[31],"commercial":[32],"systems":[34,48],"have":[35],"search":[38,50],"engines":[39],"as":[40,93,119,161],"their":[41],"retrieval":[43],"systems.":[44,216],"Typically,":[45],"such":[46,92,160],"retrieve":[49],"results,":[51,57],"download":[52],"HTML":[53,65,114,129,152,156,184,196,213],"sources":[54],"then":[59],"extract":[60],"plain":[61,117,133],"texts":[62],"from":[63],"sources.":[66],"Plain":[67],"text":[68,118,134],"documents":[69],"or":[70],"chunks":[71],"are":[72,221],"fed":[73],"into":[74],"LLMs":[76,143],"augment":[78],"generation.":[80],"However,":[81,150],"much":[82],"structural":[85],"semantic":[87],"information":[88],"inherent":[89],"HTML,":[91],"headings":[94],"table":[96],"structures,":[97],"lost":[99],"during":[100],"this":[101,107,180],"plain-text-based":[102],"process.":[104],"To":[105,178],"problem,":[108],"we":[109,182],"propose":[110,183],"HtmlRAG,":[111],"which":[112,167],"uses":[113],"instead":[115],"format":[121],"retrieved":[123],"RAG.":[126],"We":[127],"believe":[128],"better":[131],"than":[132],"modeling":[136],"documents,":[140],"most":[142],"possess":[144],"robust":[145],"capacities":[146],"understand":[148],"HTML.":[149],"utilizing":[151],"presents":[153],"new":[154],"challenges.":[155],"contains":[157],"additional":[158],"content":[159],"tags,":[162],"JavaScript,":[163],"CSS":[165],"specifications,":[166],"bring":[168],"extra":[169],"input":[170],"tokens":[171],"noise":[173],"system.":[177],"address":[179],"issue,":[181],"cleaning,":[185],"compression,":[186],"two-step":[189],"block-tree-based":[190],"pruning":[191],"strategy,":[192],"shorten":[194],"while":[197],"minimizing":[198],"loss":[200],"information.":[202],"Experiments":[203],"on":[204],"six":[205],"QA":[206],"datasets":[207,220],"confirm":[208],"superiority":[210],"using":[212],"Our":[217],"code":[218],"available":[222],"at":[223],"https://github.com/plageon/HtmlRAG.":[224]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":7}],"updated_date":"2026-04-13T07:58:08.660418","created_date":"2025-10-10T00:00:00"}
