{"id":"https://openalex.org/W7123962553","doi":"https://doi.org/10.1109/mmsp64401.2025.11324237","title":"Rethinking Document Layout Analysis through Text Clustering via Multi-Modal Graph Convolution Networks","display_name":"Rethinking Document Layout Analysis through Text Clustering via Multi-Modal Graph Convolution Networks","publication_year":2025,"publication_date":"2025-09-21","ids":{"openalex":"https://openalex.org/W7123962553","doi":"https://doi.org/10.1109/mmsp64401.2025.11324237"},"language":null,"primary_location":{"id":"doi:10.1109/mmsp64401.2025.11324237","is_oa":false,"landing_page_url":"https://doi.org/10.1109/mmsp64401.2025.11324237","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Workshop on Multimedia Signal Processing (MMSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5122927726","display_name":"Wenxi Li","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Wenxi Li","raw_affiliation_strings":["Tsinghua University,Beijing National Research Center for Information Science and Technology,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Tsinghua University,Beijing National Research Center for Information Science and Technology,Beijing,China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5093130749","display_name":"Chenyang Lyu","orcid":"https://orcid.org/0009-0002-6733-5879"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chenyang Lyu","raw_affiliation_strings":["Alibaba International Digital Commerce,AI Business,Hangzhou,China"],"affiliations":[{"raw_affiliation_string":"Alibaba International Digital Commerce,AI Business,Hangzhou,China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101592657","display_name":"Wei Ji","orcid":"https://orcid.org/0000-0001-9526-9886"},"institutions":[{"id":"https://openalex.org/I4210136246","display_name":"China Telecom (China)","ror":"https://ror.org/03jgnzt20","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210136246"]},{"id":"https://openalex.org/I4387153335","display_name":"China Telecom","ror":"https://ror.org/05p67dv18","country_code":null,"type":"company","lineage":["https://openalex.org/I4387153335"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wei Ji","raw_affiliation_strings":["China Telecom,Shanghai,China"],"affiliations":[{"raw_affiliation_string":"China Telecom,Shanghai,China","institution_ids":["https://openalex.org/I4210136246","https://openalex.org/I4387153335"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071890018","display_name":"Liting Zhou","orcid":"https://orcid.org/0000-0002-7778-8743"},"institutions":[{"id":"https://openalex.org/I42934936","display_name":"Dublin City University","ror":"https://ror.org/04a1a1e81","country_code":"IE","type":"education","lineage":["https://openalex.org/I42934936"]}],"countries":["IE"],"is_corresponding":false,"raw_author_name":"Liting Zhou","raw_affiliation_strings":["Dublin City University,Dublin,Ireland"],"affiliations":[{"raw_affiliation_string":"Dublin City University,Dublin,Ireland","institution_ids":["https://openalex.org/I42934936"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5095025991","display_name":"Cathal Gurrin","orcid":null},"institutions":[{"id":"https://openalex.org/I42934936","display_name":"Dublin City University","ror":"https://ror.org/04a1a1e81","country_code":"IE","type":"education","lineage":["https://openalex.org/I42934936"]}],"countries":["IE"],"is_corresponding":false,"raw_author_name":"Cathal Gurrin","raw_affiliation_strings":["Dublin City University,Dublin,Ireland"],"affiliations":[{"raw_affiliation_string":"Dublin City University,Dublin,Ireland","institution_ids":["https://openalex.org/I42934936"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101676444","display_name":"Y. F. Guo","orcid":"https://orcid.org/0000-0002-1227-1663"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuchen Guo","raw_affiliation_strings":["Tsinghua University,Beijing National Research Center for Information Science and Technology,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Tsinghua University,Beijing National Research Center for Information Science and Technology,Beijing,China","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5122927726"],"corresponding_institution_ids":["https://openalex.org/I99065089"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.69168708,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"352","last_page":"357"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9300000071525574,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9300000071525574,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.024800000712275505,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10789","display_name":"Interactive and Immersive Displays","score":0.006099999882280827,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/document-layout-analysis","display_name":"Document layout analysis","score":0.8406000137329102},{"id":"https://openalex.org/keywords/document-clustering","display_name":"Document clustering","score":0.6818000078201294},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.6014000177383423},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.5184000134468079},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.4869000017642975},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.42590001225471497},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.4180000126361847},{"id":"https://openalex.org/keywords/text-graph","display_name":"Text graph","score":0.4106000065803528},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.3237999975681305}],"concepts":[{"id":"https://openalex.org/C72773152","wikidata":"https://www.wikidata.org/wiki/Q5287629","display_name":"Document layout analysis","level":3,"score":0.8406000137329102},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8023999929428101},{"id":"https://openalex.org/C177937566","wikidata":"https://www.wikidata.org/wiki/Q4223102","display_name":"Document clustering","level":3,"score":0.6818000078201294},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.6014000177383423},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.5184000134468079},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.4869000017642975},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.459199994802475},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4262000024318695},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.42590001225471497},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.4180000126361847},{"id":"https://openalex.org/C66945725","wikidata":"https://www.wikidata.org/wiki/Q18388823","display_name":"Text graph","level":3,"score":0.4106000065803528},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3637999892234802},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.3237999975681305},{"id":"https://openalex.org/C45347329","wikidata":"https://www.wikidata.org/wiki/Q5166604","display_name":"Convolution (computer science)","level":3,"score":0.32359999418258667},{"id":"https://openalex.org/C2778371909","wikidata":"https://www.wikidata.org/wiki/Q3771738","display_name":"Historical document","level":2,"score":0.3231000006198883},{"id":"https://openalex.org/C68699486","wikidata":"https://www.wikidata.org/wiki/Q265904","display_name":"Document Structure Description","level":3,"score":0.32199999690055847},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.31619998812675476},{"id":"https://openalex.org/C2779500292","wikidata":"https://www.wikidata.org/wiki/Q14802672","display_name":"Text processing","level":2,"score":0.3057999908924103},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3001999855041504},{"id":"https://openalex.org/C67905146","wikidata":"https://www.wikidata.org/wiki/Q5287646","display_name":"Document processing","level":2,"score":0.2957000136375427},{"id":"https://openalex.org/C112953755","wikidata":"https://www.wikidata.org/wiki/Q739462","display_name":"Graph drawing","level":3,"score":0.295199990272522},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.2858000099658966},{"id":"https://openalex.org/C2911174283","wikidata":"https://www.wikidata.org/wiki/Q739462","display_name":"Graph Layout","level":4,"score":0.2759000062942505},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2678999900817871},{"id":"https://openalex.org/C98501671","wikidata":"https://www.wikidata.org/wiki/Q1948408","display_name":"Text segmentation","level":3,"score":0.2678000032901764},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.26510000228881836},{"id":"https://openalex.org/C88230418","wikidata":"https://www.wikidata.org/wiki/Q131476","display_name":"Graph theory","level":2,"score":0.26429998874664307},{"id":"https://openalex.org/C2988504005","wikidata":"https://www.wikidata.org/wiki/Q379942","display_name":"Document image processing","level":4,"score":0.257999986410141}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/mmsp64401.2025.11324237","is_oa":false,"landing_page_url":"https://doi.org/10.1109/mmsp64401.2025.11324237","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Workshop on Multimedia Signal Processing (MMSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W639708223","https://openalex.org/W2769316437","https://openalex.org/W2889055479","https://openalex.org/W2934198733","https://openalex.org/W2963150697","https://openalex.org/W2963351448","https://openalex.org/W2963849369","https://openalex.org/W2964241181","https://openalex.org/W2965744772","https://openalex.org/W2980913015","https://openalex.org/W2982770724","https://openalex.org/W2997154779","https://openalex.org/W3003711898","https://openalex.org/W3003791552","https://openalex.org/W3034404784","https://openalex.org/W3035669277","https://openalex.org/W3110340916","https://openalex.org/W3113410735","https://openalex.org/W3117707016","https://openalex.org/W3156636935","https://openalex.org/W3157758108","https://openalex.org/W3159307593","https://openalex.org/W4214613769","https://openalex.org/W4290927927","https://openalex.org/W4386008071","https://openalex.org/W4386083123"],"related_works":[],"abstract_inverted_index":{"Document":[0],"layout":[1,146],"analysis,":[2,147],"a":[3,41,54,84,103,140],"critical":[4],"process":[5],"in":[6,30,127,156],"automated":[7,152],"document":[8,45,130,145,153,161],"processing,":[9],"traditionally":[10],"relies":[11],"on":[12,18,115],"object":[13],"detection":[14,70],"techniques,":[15],"primarily":[16],"focusing":[17],"the":[19,33,37,106,136,149],"structural":[20,69,109],"segmentation":[21],"of":[22,44,111,151],"documents.":[23,112],"However,":[24],"these":[25],"approaches":[26],"often":[27],"fall":[28],"short":[29],"comprehensively":[31],"understanding":[32,128],"semantic":[34,72],"content":[35],"within":[36],"text,":[38],"leading":[39],"to":[40,67,99,135,144],"disjointed":[42],"analysis":[43],"structure":[46],"and":[47,92,108,159],"content.":[48],"To":[49],"address":[50],"this,":[51],"we":[52,89],"propose":[53],"novel":[55],"methodology":[56],"that":[57,119],"combines":[58],"text":[59,78],"clustering":[60],"with":[61,71,77],"multi-modal":[62],"graph":[63],"convolution":[64],"networks,":[65],"aiming":[66],"integrate":[68,90],"understanding.":[73],"Our":[74],"approach":[75,143],"starts":[76],"detection,":[79],"followed":[80],"by":[81,138],"encoding":[82],"using":[83,95],"large":[85],"language":[86],"model.":[87],"Subsequently,":[88],"visual":[91],"positional":[93],"data":[94],"Graph":[96],"Neural":[97],"Networks":[98],"perform":[100],"clustering,":[101],"creating":[102],"synergy":[104],"between":[105],"textual":[107],"aspects":[110],"Extensive":[113],"experiments":[114],"mainstream":[116],"datasets":[117],"demonstrate":[118],"our":[120],"method":[121],"significantly":[122],"outperforms":[123],"existing":[124],"approaches,":[125],"especially":[126],"text-centric":[129],"layouts.":[131],"This":[132],"paper":[133],"contributes":[134],"field":[137],"offering":[139],"novel,":[141],"semantically-enriched":[142],"enhancing":[148],"capabilities":[150],"processing":[154],"systems":[155],"handling":[157],"diverse":[158],"complex":[160],"formats.":[162]},"counts_by_year":[],"updated_date":"2026-01-14T23:44:37.837170","created_date":"2026-01-14T00:00:00"}
