{"id":"https://openalex.org/W2089069287","doi":"https://doi.org/10.1145/1967486.1967507","title":"Extracting XML data from the web","display_name":"Extracting XML data from the web","publication_year":2010,"publication_date":"2010-11-08","ids":{"openalex":"https://openalex.org/W2089069287","doi":"https://doi.org/10.1145/1967486.1967507","mag":"2089069287"},"language":"en","primary_location":{"id":"doi:10.1145/1967486.1967507","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1967486.1967507","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 12th International Conference on Information Integration and Web-based Applications &amp; Services","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5000806503","display_name":"Ngo Sy Viet Phu","orcid":null},"institutions":[{"id":"https://openalex.org/I146399215","display_name":"University of Tsukuba","ror":"https://ror.org/02956yf07","country_code":"JP","type":"education","lineage":["https://openalex.org/I146399215"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Ngo Sy Viet Phu","raw_affiliation_strings":["University of Tsukuba, Japan"],"affiliations":[{"raw_affiliation_string":"University of Tsukuba, Japan","institution_ids":["https://openalex.org/I146399215"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025604052","display_name":"Toshiyuki Amagasa","orcid":"https://orcid.org/0000-0003-0595-2230"},"institutions":[{"id":"https://openalex.org/I146399215","display_name":"University of Tsukuba","ror":"https://ror.org/02956yf07","country_code":"JP","type":"education","lineage":["https://openalex.org/I146399215"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Toshiyuki Amagasa","raw_affiliation_strings":["University of Tsukuba, Japan"],"affiliations":[{"raw_affiliation_string":"University of Tsukuba, Japan","institution_ids":["https://openalex.org/I146399215"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5064322519","display_name":"Hiroyuki Kitagawa","orcid":"https://orcid.org/0000-0002-2984-2258"},"institutions":[{"id":"https://openalex.org/I146399215","display_name":"University of Tsukuba","ror":"https://ror.org/02956yf07","country_code":"JP","type":"education","lineage":["https://openalex.org/I146399215"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Hiroyuki Kitagawa","raw_affiliation_strings":["University of Tsukuba, Japan"],"affiliations":[{"raw_affiliation_string":"University of Tsukuba, Japan","institution_ids":["https://openalex.org/I146399215"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5000806503"],"corresponding_institution_ids":["https://openalex.org/I146399215"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.15983729,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"109","last_page":"116"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10317","display_name":"Advanced Database Systems and Queries","score":0.9943000078201294,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9855999946594238,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8397697806358337},{"id":"https://openalex.org/keywords/xml-validation","display_name":"XML validation","score":0.7699565887451172},{"id":"https://openalex.org/keywords/efficient-xml-interchange","display_name":"Efficient XML Interchange","score":0.769206166267395},{"id":"https://openalex.org/keywords/xml-schema-editor","display_name":"XML Schema Editor","score":0.7099361419677734},{"id":"https://openalex.org/keywords/xml-database","display_name":"XML database","score":0.6938928365707397},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.6699076890945435},{"id":"https://openalex.org/keywords/document-structure-description","display_name":"Document Structure Description","score":0.6525439620018005},{"id":"https://openalex.org/keywords/streaming-xml","display_name":"Streaming XML","score":0.5859223008155823},{"id":"https://openalex.org/keywords/xml-schema","display_name":"XML Schema (W3C)","score":0.5749157667160034},{"id":"https://openalex.org/keywords/xml","display_name":"XML","score":0.5645461082458496},{"id":"https://openalex.org/keywords/xml-encryption","display_name":"XML Encryption","score":0.4795396327972412},{"id":"https://openalex.org/keywords/xml-signature","display_name":"XML Signature","score":0.46567031741142273},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.39267975091934204},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.38735443353652954},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.2504968047142029}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8397697806358337},{"id":"https://openalex.org/C55348073","wikidata":"https://www.wikidata.org/wiki/Q595926","display_name":"XML validation","level":3,"score":0.7699565887451172},{"id":"https://openalex.org/C11508877","wikidata":"https://www.wikidata.org/wiki/Q1124477","display_name":"Efficient XML Interchange","level":3,"score":0.769206166267395},{"id":"https://openalex.org/C34716815","wikidata":"https://www.wikidata.org/wiki/Q8042322","display_name":"XML Schema Editor","level":3,"score":0.7099361419677734},{"id":"https://openalex.org/C183068750","wikidata":"https://www.wikidata.org/wiki/Q357393","display_name":"XML database","level":3,"score":0.6938928365707397},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.6699076890945435},{"id":"https://openalex.org/C68699486","wikidata":"https://www.wikidata.org/wiki/Q265904","display_name":"Document Structure Description","level":3,"score":0.6525439620018005},{"id":"https://openalex.org/C44883583","wikidata":"https://www.wikidata.org/wiki/Q7622687","display_name":"Streaming XML","level":3,"score":0.5859223008155823},{"id":"https://openalex.org/C40713593","wikidata":"https://www.wikidata.org/wiki/Q16342","display_name":"XML Schema (W3C)","level":5,"score":0.5749157667160034},{"id":"https://openalex.org/C8797682","wikidata":"https://www.wikidata.org/wiki/Q2115","display_name":"XML","level":2,"score":0.5645461082458496},{"id":"https://openalex.org/C173242113","wikidata":"https://www.wikidata.org/wiki/Q607488","display_name":"XML Encryption","level":4,"score":0.4795396327972412},{"id":"https://openalex.org/C34330436","wikidata":"https://www.wikidata.org/wiki/Q979532","display_name":"XML Signature","level":4,"score":0.46567031741142273},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.39267975091934204},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.38735443353652954},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.2504968047142029}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/1967486.1967507","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1967486.1967507","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 12th International Conference on Information Integration and Web-based Applications &amp; Services","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1117780231","display_name":null,"funder_award_id":"21700093","funder_id":"https://openalex.org/F4320334764","funder_display_name":"Japan Society for the Promotion of Science"},{"id":"https://openalex.org/G856843734","display_name":null,"funder_award_id":"21013004","funder_id":"https://openalex.org/F4320320912","funder_display_name":"Ministry of Education, Culture, Sports, Science and Technology"}],"funders":[{"id":"https://openalex.org/F4320320912","display_name":"Ministry of Education, Culture, Sports, Science and Technology","ror":"https://ror.org/048rj2z13"},{"id":"https://openalex.org/F4320334764","display_name":"Japan Society for the Promotion of Science","ror":"https://ror.org/00hhkn466"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":16,"referenced_works":["https://openalex.org/W167149422","https://openalex.org/W1486332463","https://openalex.org/W1489949474","https://openalex.org/W1493490255","https://openalex.org/W1966446268","https://openalex.org/W2060565333","https://openalex.org/W2082225138","https://openalex.org/W2096891167","https://openalex.org/W2103931177","https://openalex.org/W2129629757","https://openalex.org/W2133640934","https://openalex.org/W2134150392","https://openalex.org/W2148540243","https://openalex.org/W2149997683","https://openalex.org/W2165286227","https://openalex.org/W6629638141"],"related_works":["https://openalex.org/W2536894089","https://openalex.org/W2357844625","https://openalex.org/W2047342127","https://openalex.org/W2038564770","https://openalex.org/W2385975321","https://openalex.org/W1509918510","https://openalex.org/W801203902","https://openalex.org/W2997449093","https://openalex.org/W2106024890","https://openalex.org/W2124122806"],"abstract_inverted_index":{"Information":[0],"Extraction":[1],"(IE)":[2],"is":[3],"a":[4,60,90,106,116,138],"technique":[5],"to":[6,89,115],"extract":[7],"structured":[8],"information":[9,78],"(record)":[10],"from":[11,79,124,128],"unstructured":[12],"documents":[13],"such":[14,28],"as":[15,29],"Web":[16],"pages.":[17],"However,":[18],"existing":[19,56],"techniques":[20],"are":[21,103,113],"basically":[22],"aiming":[23],"at":[24],"extracting":[25,48],"simple":[26],"records,":[27],"binary":[30,109],"relationships":[31],"like":[32,38,51],"(company,":[33],"location)":[34],"or":[35],"named":[36],"entities":[37],"(organization).":[39],"In":[40],"this":[41],"paper,":[42],"we":[43,73,84],"propose":[44,146],"an":[45,55,147],"algorithm":[46,163],"for":[47,150],"complex":[49],"records":[50,64,88,94],"XML":[52,69,81,87,122,153],"by":[53],"utilizing":[54],"IE":[57],"technique.":[58],"Given":[59],"set":[61,91,107],"of":[62,68,92,96,108,131,160],"seed":[63],"in":[65,164],"the":[66,76,80,86,125,129,132,158],"form":[67],"data":[70,123],"(XML":[71],"records),":[72],"firstly":[74],"infer":[75],"schema":[77],"records.":[82],"Then,":[83],"transform":[85],"relational":[93,101],"consisting":[95],"several":[97],"tables.":[98],"The":[99],"obtained":[100,127],"tables":[102],"decomposed":[104],"into":[105],"relations,":[110],"and":[111,145],"they":[112],"forwarded":[114],"record":[117,130,154],"extraction":[118,133],"system.":[119,134],"We":[120,135,156],"reconstruct":[121],"results":[126],"point":[136],"out":[137],"naive":[139],"implementation":[140],"docs":[141],"not":[142],"work":[143],"well,":[144],"improved":[148],"scheme":[149],"more":[151],"efficient":[152],"extraction.":[155],"evaluate":[157],"effectiveness":[159],"our":[161],"proposed":[162],"some":[165],"experiments.":[166]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
