{"id":"https://openalex.org/W4410637036","doi":"https://doi.org/10.1145/3701716.3715289","title":"EDGAR-CRAWLER: From Raw Web Documents to Structured Financial NLP Datasets","display_name":"EDGAR-CRAWLER: From Raw Web Documents to Structured Financial NLP Datasets","publication_year":2025,"publication_date":"2025-05-08","ids":{"openalex":"https://openalex.org/W4410637036","doi":"https://doi.org/10.1145/3701716.3715289"},"language":"en","primary_location":{"id":"doi:10.1145/3701716.3715289","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3701716.3715289","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3701716.3715289","source":null,"license":"cc-by-nc-sa","license_id":"https://openalex.org/licenses/cc-by-nc-sa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion Proceedings of the ACM on Web Conference 2025","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3701716.3715289","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5017613411","display_name":"Lefteris Loukas","orcid":"https://orcid.org/0000-0002-7473-9428"},"institutions":[{"id":"https://openalex.org/I73142707","display_name":"Athens University of Economics and Business","ror":"https://ror.org/03s262162","country_code":"GR","type":"education","lineage":["https://openalex.org/I73142707"]}],"countries":["GR"],"is_corresponding":false,"raw_author_name":"Lefteris Loukas","raw_affiliation_strings":["Department of Informatics, Athens University of Economics and Business, Athens, Greece"],"raw_orcid":"https://orcid.org/0000-0002-7473-9428","affiliations":[{"raw_affiliation_string":"Department of Informatics, Athens University of Economics and Business, Athens, Greece","institution_ids":["https://openalex.org/I73142707"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5092596169","display_name":"Fabian Billert","orcid":null},"institutions":[{"id":"https://openalex.org/I44260953","display_name":"Heinrich Heine University D\u00fcsseldorf","ror":"https://ror.org/024z2rq82","country_code":"DE","type":"education","lineage":["https://openalex.org/I44260953"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Fabian Billert","raw_affiliation_strings":["Heinrich Heine University of D\u00fcsseldorf, D\u00fcsseldorf, Germany"],"raw_orcid":"https://orcid.org/0009-0005-0311-2176","affiliations":[{"raw_affiliation_string":"Heinrich Heine University of D\u00fcsseldorf, D\u00fcsseldorf, Germany","institution_ids":["https://openalex.org/I44260953"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008240283","display_name":"Manos Fergadiotis","orcid":"https://orcid.org/0000-0002-7657-5156"},"institutions":[{"id":"https://openalex.org/I73142707","display_name":"Athens University of Economics and Business","ror":"https://ror.org/03s262162","country_code":"GR","type":"education","lineage":["https://openalex.org/I73142707"]}],"countries":["GR"],"is_corresponding":false,"raw_author_name":"Manos Fergadiotis","raw_affiliation_strings":["Department of Informatics, Athens University of Economics and Business, Athens, Greece"],"raw_orcid":"https://orcid.org/0000-0002-7657-5156","affiliations":[{"raw_affiliation_string":"Department of Informatics, Athens University of Economics and Business, Athens, Greece","institution_ids":["https://openalex.org/I73142707"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016618602","display_name":"Prodromos Malakasiotis","orcid":"https://orcid.org/0009-0008-0055-5598"},"institutions":[{"id":"https://openalex.org/I73142707","display_name":"Athens University of Economics and Business","ror":"https://ror.org/03s262162","country_code":"GR","type":"education","lineage":["https://openalex.org/I73142707"]}],"countries":["GR"],"is_corresponding":false,"raw_author_name":"Prodromos Malakasiotis","raw_affiliation_strings":["Department of Informatics, Athens University of Economics and Business, Athens, Greece"],"raw_orcid":"https://orcid.org/0009-0008-0055-5598","affiliations":[{"raw_affiliation_string":"Department of Informatics, Athens University of Economics and Business, Athens, Greece","institution_ids":["https://openalex.org/I73142707"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5069270736","display_name":"Ion Androutsopoulos","orcid":"https://orcid.org/0009-0000-2969-0509"},"institutions":[{"id":"https://openalex.org/I73142707","display_name":"Athens University of Economics and Business","ror":"https://ror.org/03s262162","country_code":"GR","type":"education","lineage":["https://openalex.org/I73142707"]}],"countries":["GR"],"is_corresponding":false,"raw_author_name":"Ion Androutsopoulos","raw_affiliation_strings":["Department of Informatics, Athens University of Economics and Business, Athens, Greece"],"raw_orcid":"https://orcid.org/0009-0000-2969-0509","affiliations":[{"raw_affiliation_string":"Department of Informatics, Athens University of Economics and Business, Athens, Greece","institution_ids":["https://openalex.org/I73142707"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":4.6264,"has_fulltext":true,"cited_by_count":3,"citation_normalized_percentile":{"value":0.94240719,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"761","last_page":"764"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11326","display_name":"Stock Market Forecasting Methods","score":0.9878000020980835,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11326","display_name":"Stock Market Forecasting Methods","score":0.9878000020980835,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10081","display_name":"Auditing, Earnings Management, Governance","score":0.9757000207901001,"subfield":{"id":"https://openalex.org/subfields/1402","display_name":"Accounting"},"field":{"id":"https://openalex.org/fields/14","display_name":"Business, Management and Accounting"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T13794","display_name":"Financial Reporting and XBRL","score":0.9735000133514404,"subfield":{"id":"https://openalex.org/subfields/1404","display_name":"Management Information Systems"},"field":{"id":"https://openalex.org/fields/14","display_name":"Business, Management and Accounting"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/web-crawler","display_name":"Web crawler","score":0.9121544361114502},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.711010754108429},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5484890341758728},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.5292659997940063},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4689962863922119},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.4446151852607727}],"concepts":[{"id":"https://openalex.org/C13743948","wikidata":"https://www.wikidata.org/wiki/Q45842","display_name":"Web crawler","level":2,"score":0.9121544361114502},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.711010754108429},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5484890341758728},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.5292659997940063},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4689962863922119},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.4446151852607727}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3701716.3715289","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3701716.3715289","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3701716.3715289","source":null,"license":"cc-by-nc-sa","license_id":"https://openalex.org/licenses/cc-by-nc-sa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion Proceedings of the ACM on Web Conference 2025","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3701716.3715289","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3701716.3715289","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3701716.3715289","source":null,"license":"cc-by-nc-sa","license_id":"https://openalex.org/licenses/cc-by-nc-sa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion Proceedings of the ACM on Web Conference 2025","raw_type":"proceedings-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/8","score":0.49000000953674316,"display_name":"Decent work and economic growth"}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4410637036.pdf","grobid_xml":"https://content.openalex.org/works/W4410637036.grobid-xml"},"referenced_works_count":12,"referenced_works":["https://openalex.org/W2063595321","https://openalex.org/W2771976988","https://openalex.org/W2914181088","https://openalex.org/W3121646650","https://openalex.org/W3121742200","https://openalex.org/W3122472793","https://openalex.org/W4206729293","https://openalex.org/W4285137661","https://openalex.org/W4285333908","https://openalex.org/W4385571939","https://openalex.org/W4394769093","https://openalex.org/W4404472259"],"related_works":["https://openalex.org/W2152505903","https://openalex.org/W2054759010","https://openalex.org/W3202833648","https://openalex.org/W2066301148","https://openalex.org/W2626977282","https://openalex.org/W2566625334","https://openalex.org/W2733224566","https://openalex.org/W3182846679","https://openalex.org/W2081888991","https://openalex.org/W3204019825"],"abstract_inverted_index":{"Acquiring":[0],"data":[1,17,164],"for":[2,39,102,153],"resource-limited":[3],"domains":[4],"like":[5],"financial":[6,16,121,154],"nlp":[7,82,135],"presents":[8],"numerous":[9],"challenges.One":[10],"rich":[11],"web":[12,162],"resource":[13],"of":[14,45,75,88,112,149],"textual":[15],"is":[18,49,178],"edgar,":[19],"the":[20,43,86,97,147],"Electronic":[21],"Data":[22],"Gathering,":[23],"Analysis,":[24],"and":[25,63,105,125,140,174],"Retrieval":[26],"system,":[27],"where":[28],"U.S.":[29],"legislation":[30],"requires":[31],"publicly":[32],"traded":[33],"companies":[34],"to":[35,79,160],"upload":[36],"economic":[37],"reports":[38],"investor":[40],"transparency.However,":[41],"automating":[42],"process":[44],"downloading":[46,104],"edgar":[47,107],"filings":[48],"not":[50],"enough.These":[51],"long-form":[52],"documents,":[53],"often":[54,70],"exceeding":[55],"100":[56],"pages":[57],"each,":[58],"include":[59],"noisy":[60],"text,":[61],"images,":[62],"tables,":[64],"requiring":[65],"significant":[66],"manual":[67],"preprocessing.Moreover,":[68],"researchers":[69],"require":[71],"only":[72],"specific":[73,81],"sections":[74],"these":[76,89,92],"lengthy":[77,119],"documents":[78,122],"solve":[80],"tasks,":[83],"further":[84],"complicating":[85],"use":[87],"filings.To":[90],"address":[91],"challenges,":[93],"we":[94],"present":[95],"edgar-crawler,":[96],"first":[98],"Open-Source":[99],"Software":[100],"(oss)":[101],"automatically":[103],"preprocessing":[106],"filings.Equipped":[108],"with":[109],"a":[110,157],"variety":[111],"document":[113],"mining":[114],"options,":[115,144],"edgar-crawler":[116,145,166],"can":[117,130],"convert":[118],"raw":[120],"into":[123,134],"clean":[124],"section-specific":[126],"json":[127],"files,":[128],"which":[129],"be":[131],"integrated":[132],"seamlessly":[133],"workflows.Supporting":[136],"multiple":[137],"filing":[138],"types":[139],"offering":[141],"extensive":[142],"filtering":[143],"facilitates":[146],"creation":[148],"structured":[150],"large-scale":[151],"datasets":[152],"research.Serving":[155],"as":[156],"free":[158],"alternative":[159],"premium":[161],"service":[163],"providers,":[165],"has":[167],"been":[168],"widely":[169],"adopted":[170],"by":[171],"both":[172],"practitioners":[173],"academic":[175],"researchers.Our":[176],"software":[177],"available":[179],"at":[180],"https://github.com/lefterisloukas/edgar-crawler.":[181]},"counts_by_year":[{"year":2026,"cited_by_count":3}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
