{"id":"https://openalex.org/W4411287357","doi":"https://doi.org/10.1007/s10791-025-09638-7","title":"Max\u2013Min semantic chunking of documents for RAG application","display_name":"Max\u2013Min semantic chunking of documents for RAG application","publication_year":2025,"publication_date":"2025-06-13","ids":{"openalex":"https://openalex.org/W4411287357","doi":"https://doi.org/10.1007/s10791-025-09638-7"},"language":"en","primary_location":{"id":"doi:10.1007/s10791-025-09638-7","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s10791-025-09638-7","pdf_url":"https://link.springer.com/content/pdf/10.1007/s10791-025-09638-7.pdf","source":{"id":"https://openalex.org/S5407036663","display_name":"Discover Computing","issn_l":"2948-2992","issn":["2948-2992"],"is_oa":true,"is_in_doaj":true,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Discover Computing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://link.springer.com/content/pdf/10.1007/s10791-025-09638-7.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5043368575","display_name":"Csaba Kiss","orcid":"https://orcid.org/0000-0002-0237-0887"},"institutions":[{"id":"https://openalex.org/I29770179","display_name":"Budapest University of Technology and Economics","ror":"https://ror.org/02w42ss30","country_code":"HU","type":"education","lineage":["https://openalex.org/I29770179"]}],"countries":["HU"],"is_corresponding":true,"raw_author_name":"Csaba Kiss","raw_affiliation_strings":["Department of Stochastics, Institute of Mathematics,  Budapest University of Technology and Economics, M\u0171egyetem rkp. 3., Budapest, 1111, Hungary"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Stochastics, Institute of Mathematics,  Budapest University of Technology and Economics, M\u0171egyetem rkp. 3., Budapest, 1111, Hungary","institution_ids":["https://openalex.org/I29770179"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032944996","display_name":"Marcell Nagy","orcid":"https://orcid.org/0000-0001-5666-7777"},"institutions":[{"id":"https://openalex.org/I29770179","display_name":"Budapest University of Technology and Economics","ror":"https://ror.org/02w42ss30","country_code":"HU","type":"education","lineage":["https://openalex.org/I29770179"]}],"countries":["HU"],"is_corresponding":false,"raw_author_name":"Marcell Nagy","raw_affiliation_strings":["Department of Stochastics, Institute of Mathematics,  Budapest University of Technology and Economics, M\u0171egyetem rkp. 3., Budapest, 1111, Hungary"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Stochastics, Institute of Mathematics,  Budapest University of Technology and Economics, M\u0171egyetem rkp. 3., Budapest, 1111, Hungary","institution_ids":["https://openalex.org/I29770179"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5050054322","display_name":"P\u00e9ter Szil\u00e1gyi","orcid":"https://orcid.org/0000-0003-2106-6343"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"P\u00e9ter Szil\u00e1gyi","raw_affiliation_strings":["Nokia Bell Labs, Budapest, Hungary"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Nokia Bell Labs, Budapest, Hungary","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5043368575"],"corresponding_institution_ids":["https://openalex.org/I29770179"],"apc_list":null,"apc_paid":null,"fwci":6.9247,"has_fulltext":true,"cited_by_count":4,"citation_normalized_percentile":{"value":0.96496073,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":98},"biblio":{"volume":"28","issue":"1","first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9890999794006348,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/chunking","display_name":"Chunking (psychology)","score":0.8344821929931641},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6169648766517639},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.562217116355896},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.39929550886154175},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3652505874633789}],"concepts":[{"id":"https://openalex.org/C203357204","wikidata":"https://www.wikidata.org/wiki/Q1089605","display_name":"Chunking (psychology)","level":2,"score":0.8344821929931641},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6169648766517639},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.562217116355896},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.39929550886154175},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3652505874633789}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1007/s10791-025-09638-7","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s10791-025-09638-7","pdf_url":"https://link.springer.com/content/pdf/10.1007/s10791-025-09638-7.pdf","source":{"id":"https://openalex.org/S5407036663","display_name":"Discover Computing","issn_l":"2948-2992","issn":["2948-2992"],"is_oa":true,"is_in_doaj":true,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Discover Computing","raw_type":"journal-article"},{"id":"pmh:oai:doaj.org/article:42e1da45be7f4f76bc3fd4f5c329566b","is_oa":true,"landing_page_url":"https://doaj.org/article/42e1da45be7f4f76bc3fd4f5c329566b","pdf_url":null,"source":{"id":"https://openalex.org/S4306401280","display_name":"DOAJ (DOAJ: Directory of Open Access Journals)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Discover Computing, Vol 28, Iss 1, Pp 1-15 (2025)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.1007/s10791-025-09638-7","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s10791-025-09638-7","pdf_url":"https://link.springer.com/content/pdf/10.1007/s10791-025-09638-7.pdf","source":{"id":"https://openalex.org/S5407036663","display_name":"Discover Computing","issn_l":"2948-2992","issn":["2948-2992"],"is_oa":true,"is_in_doaj":true,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Discover Computing","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1228996196","display_name":null,"funder_award_id":"TKP2021-NVA-02","funder_id":"https://openalex.org/F4320311358","funder_display_name":"Budapesti M\u0171szaki \u00e9s Gazdas\u00e1gtudom\u00e1nyi Egyetem"}],"funders":[{"id":"https://openalex.org/F4320311358","display_name":"Budapesti M\u0171szaki \u00e9s Gazdas\u00e1gtudom\u00e1nyi Egyetem","ror":"https://ror.org/02w42ss30"},{"id":"https://openalex.org/F4320320300","display_name":"European Commission","ror":"https://ror.org/00k4n6c32"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4411287357.pdf","grobid_xml":"https://content.openalex.org/works/W4411287357.grobid-xml"},"referenced_works_count":11,"referenced_works":["https://openalex.org/W1985700489","https://openalex.org/W2006255103","https://openalex.org/W2113054345","https://openalex.org/W2611726225","https://openalex.org/W3021397474","https://openalex.org/W3027879771","https://openalex.org/W4387259538","https://openalex.org/W4388778348","https://openalex.org/W4400531953","https://openalex.org/W4404715948","https://openalex.org/W4409183391"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2384729545","https://openalex.org/W2198395236","https://openalex.org/W4245487161","https://openalex.org/W2090755435","https://openalex.org/W2039036070","https://openalex.org/W2153813398","https://openalex.org/W3204019825"],"abstract_inverted_index":{"Abstract":[0],"Retrieval-augmented":[1],"generation":[2,88],"(RAG)":[3],"systems":[4],"have":[5],"emerged":[6],"as":[7],"a":[8,56,63,93],"powerful":[9],"approach":[10,74],"to":[11,35,40,66],"enhance":[12],"large":[13],"language":[14],"model":[15],"(LLM)":[16],"outputs,":[17],"however,":[18],"their":[19],"effectiveness":[20],"heavily":[21],"depends":[22],"on":[23,75,92],"document":[24],"chunking":[25,104],"strategies.":[26],"Current":[27],"methods,":[28],"often":[29],"arbitrary":[30],"or":[31],"size-based":[32],"segmentation,":[33],"fail":[34],"preserve":[36],"semantic":[37,54,60,103],"coherence,":[38],"leading":[39],"suboptimal":[41],"retrieval":[42],"and":[43,62,87,115],"reduced":[44],"output":[45],"quality.":[46],"To":[47],"overcome":[48],"this":[49],"limitation,":[50],"we":[51],"introduce":[52],"Max\u2013Min":[53,64,102],"chunking,":[55],"novel":[57],"method":[58],"utilizing":[59],"similarity":[61],"algorithm":[65],"identify":[67],"semantically":[68],"coherent":[69],"text.":[70],"We":[71],"evaluated":[72],"our":[73],"three":[76],"distinct":[77],"datasets,":[78,101],"assessing":[79],"clustering":[80],"efficiency":[81],"via":[82],"adjusted":[83],"mutual":[84],"information":[85],"(AMI)":[86],"coherence":[89],"through":[90],"accuracy":[91,118],"RAG-based":[94],"multiple-choice":[95],"question":[96],"answering":[97],"test.":[98],"Across":[99],"the":[100,127,131,143],"achieved":[105],"superior":[106],"performance":[107],"with":[108],"average":[109,117],"AMI":[110,144],"scores":[111,145],"of":[112,119],"0.85,":[113],"0.90,":[114],"an":[116],"0.56":[120],"(averaged":[121],"across":[122],"LLMs).":[123],"This":[124],"significantly":[125],"outperformed":[126],"next":[128],"best":[129],"method,":[130],"Llama":[132],"Semantic":[133],"Splitter":[134],"(AMI:":[135],"0.68,":[136],"0.70;":[137],"accuracy:":[138],"0.53).":[139],"The":[140],"improvements":[141],"in":[142],"were":[146],"statistically":[147],"significant.":[148]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":3}],"updated_date":"2026-06-13T06:13:01.061226","created_date":"2025-10-10T00:00:00"}
